pointblank 0.17.0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ failure thresholds (using the `Thresholds` class or through shorthands for this
11
11
  `Validate` class has numerous methods for defining validation steps and for obtaining
12
12
  post-interrogation metrics and data.
13
13
 
14
- Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, final_actions: 'FinalActions | None' = None, brief: 'str | bool | None' = None, lang: 'str | None' = None, locale: 'str | None' = None) -> None
14
+ Validate(data: 'IntoDataFrame', reference: 'IntoFrame | None' = None, tbl_name: 'str | None' = None, label: 'str | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, final_actions: 'FinalActions | None' = None, brief: 'str | bool | None' = None, lang: 'str | None' = None, locale: 'str | None' = None) -> None
15
15
 
16
16
  Workflow for defining a set of validations on a table and interrogating for results.
17
17
 
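The headline change in this signature is the new `reference=` parameter, which supplies a second table for cross-table comparisons by the new `col_sum_*()` aggregation methods documented further below. A minimal sketch of the new signature in use (table contents and names are illustrative):

```python
import pointblank as pb
import polars as pl

current = pl.DataFrame({"a": [1, 2, 3]})
baseline = pl.DataFrame({"a": [1, 2, 3]})

# reference= supplies a baseline table; with value=None, an aggregation
# step compares against the same column in the reference data
validation = (
    pb.Validate(data=current, reference=baseline)
    .col_sum_eq(columns="a")
    .interrogate()
)
```
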
@@ -916,7 +916,7 @@ FinalActions(*args)
916
916
  used to retrieve the summary of the validation results.
917
917
 
918
918
 
919
- Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: 'any | None' = None, **kwargs)
919
+ Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: 'Any | None' = None, **kwargs)
920
920
  Definition of a schema object.
921
921
 
922
922
  The schema object defines the structure of a table. Once it is defined, the object can be used
@@ -1167,7 +1167,7 @@ Definition of a schema object.
1167
1167
  `Schema` object is used in a validation workflow.
1168
1168
 
1169
1169
 
1170
- DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None, verify_ssl: 'bool' = True) -> None
1170
+ DraftValidation(data: 'Any', model: 'str', api_key: 'str | None' = None, verify_ssl: 'bool' = True) -> None
1171
1171
 
1172
1172
  Draft a validation plan for a given table using an LLM.
1173
1173
 
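Only the `data=` annotation changed here (from `FrameT | Any` to `Any`), but a sketch of the call may help orient readers; the model identifier below is purely a placeholder, and the accepted values depend on the supported LLM providers:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# model= names the LLM to use; the string here is a placeholder,
# not a documented value
draft = pb.DraftValidation(
    data=tbl,
    model="provider:model-name",  # hypothetical identifier
    api_key=None,  # or an explicit key string for the chosen provider
)
```
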
@@ -1382,7 +1382,7 @@ Validation steps can be thought of as sequential validations on the target
1382
1382
  data. We call `Validate`'s validation methods to build up a validation plan: a collection of steps
1383
1383
  that, in the aggregate, provides good validation coverage.
1384
1384
 
1385
- col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
1385
+ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
1386
1386
 
1387
1387
  Are column data greater than a fixed value or data in another column?
1388
1388
 
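The signature change for `col_vals_gt()` (and its sibling methods below) simply makes `thresholds=` explicitly Optional, matching its `None` default. A small sketch of the two documented uses of `value=`, with step-level thresholds (all values illustrative):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [5, 6, 7, 8], "b": [1, 2, 3, 4]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_gt(columns="a", value=0)  # fixed-value comparison
    .col_vals_gt(
        columns="a",
        value=pb.col("b"),       # column-vs-column comparison
        thresholds=(0.25, 0.5),  # 'warning' and 'error' proportions
    )
    .interrogate()
)
```
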
@@ -1607,7 +1607,7 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
1607
1607
  - Row 3: `c` is `2` and `b` is `2`.
1608
1608
 
1609
1609
 
1610
- col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
1610
+ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
1611
1611
 
1612
1612
  Are column data less than a fixed value or data in another column?
1613
1613
 
@@ -1832,7 +1832,7 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
1832
1832
  - Row 2: `b` is `1` and `c` is `1`.
1833
1833
 
1834
1834
 
1835
- col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
1835
+ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
1836
1836
 
1837
1837
  Are column data greater than or equal to a fixed value or data in another column?
1838
1838
 
@@ -2057,7 +2057,7 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
2057
2057
  - Row 4: `b` is `3` and `c` is `4`.
2058
2058
 
2059
2059
 
2060
- col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2060
+ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2061
2061
 
2062
2062
  Are column data less than or equal to a fixed value or data in another column?
2063
2063
 
@@ -2282,7 +2282,7 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
2282
2282
  - Row 4: `c` is `3` and `b` is `2`.
2283
2283
 
2284
2284
 
2285
- col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2285
+ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2286
2286
 
2287
2287
  Are column data equal to a fixed value or data in another column?
2288
2288
 
@@ -2505,7 +2505,7 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
2505
2505
  - Row 5: `a` is `5` and `b` is `4`.
2506
2506
 
2507
2507
 
2508
- col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2508
+ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2509
2509
 
2510
2510
  Are column data not equal to a fixed value or data in another column?
2511
2511
 
@@ -2726,7 +2726,7 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
2726
2726
  0 and 4, where `a` is `5` and `b` is `5` in both cases (i.e., they are equal to each other).
2727
2727
 
2728
2728
 
2729
- col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2729
+ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2730
2730
 
2731
2731
  Do column data lie between two specified values or data in other columns?
2732
2732
 
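For `col_vals_between()`, the same Optional `thresholds=` fix applies; a short sketch of the bounds-related parameters (data illustrative):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [2, 3, 5, 7]})

validation = (
    pb.Validate(data=tbl)
    # inclusive=(True, False) keeps the left bound and excludes the right
    .col_vals_between(columns="a", left=2, right=7, inclusive=(True, False))
    .interrogate()
)
```
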
@@ -2971,7 +2971,7 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
2971
2971
  - Row 4: `b` is `8` but the bounds are `3` (`a`) and `7` (`c`).
2972
2972
 
2973
2973
 
2974
- col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2974
+ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
2975
2975
 
2976
2976
  Do column data lie outside of two specified values or data in other columns?
2977
2977
 
@@ -3216,7 +3216,7 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
3216
3216
  - Row 5: `b` is `6` and the bounds are `5` (`a`) and `7` (`c`).
3217
3217
 
3218
3218
 
3219
- col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3219
+ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3220
3220
 
3221
3221
  Validate whether column values are in a set of values.
3222
3222
 
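A sketch of `col_vals_in_set()` under the updated signature (data illustrative):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"status": ["open", "closed", "open", "pending"]})

validation = (
    pb.Validate(data=tbl)
    # set= accepts any collection of allowed values
    .col_vals_in_set(columns="status", set=["open", "closed", "pending"])
    .interrogate()
)
```
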
@@ -3463,7 +3463,7 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
3463
3463
  specified set.
3464
3464
 
3465
3465
 
3466
- col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3466
+ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3467
3467
 
3468
3468
  Validate whether column values are not in a set of values.
3469
3469
 
@@ -3687,7 +3687,7 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
3687
3687
  statuses in the `InvalidStatus` enum.
3688
3688
 
3689
3689
 
3690
- col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3690
+ col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3691
3691
 
3692
3692
  Are column data increasing by row?
3693
3693
 
@@ -3815,7 +3815,7 @@ col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector |
3815
3815
  ```
3816
3816
 
3817
3817
 
3818
- col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3818
+ col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3819
3819
 
3820
3820
  Are column data decreasing by row?
3821
3821
 
@@ -3943,7 +3943,7 @@ col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector |
3943
3943
  ```
3944
3944
 
3945
3945
 
3946
- col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3946
+ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3947
3947
 
3948
3948
  Validate whether values in a column are Null.
3949
3949
 
@@ -4129,7 +4129,7 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
4129
4129
  two non-Null values in column `b`.
4130
4130
 
4131
4131
 
4132
- col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4132
+ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4133
4133
 
4134
4134
  Validate whether values in a column are not Null.
4135
4135
 
@@ -4315,7 +4315,7 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
4315
4315
  two Null values in column `b`.
4316
4316
 
4317
4317
 
4318
- col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4318
+ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4319
4319
 
4320
4320
  Validate whether column values match a regular expression pattern.
4321
4321
 
@@ -4511,7 +4511,7 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
4511
4511
  string values of rows 1 and 2 in column `b`.
4512
4512
 
4513
4513
 
4514
- col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4514
+ col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4515
4515
 
4516
4516
  Validate whether column values fit within a specification.
4517
4517
 
@@ -4729,7 +4729,7 @@ col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector |
4729
4729
  The validation table shows that one test unit failed (the invalid email address in row 3).
4730
4730
 
4731
4731
 
4732
- col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4732
+ col_vals_expr(self, expr: 'Any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4733
4733
 
4734
4734
  Validate column values using a custom expression.
4735
4735
 
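`col_vals_expr()` now annotates `expr=` as `Any` rather than the invalid `any`. A sketch, assuming a backend-native boolean expression is accepted for a Polars table:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2, 3], "b": [2, 4, 9]})

validation = (
    pb.Validate(data=tbl)
    # expr= carries a boolean expression evaluated row by row
    .col_vals_expr(expr=pl.col("b") >= pl.col("a") * 2)
    .interrogate()
)
```
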
@@ -4900,7 +4900,2653 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
4900
4900
  by using `col_vals_expr()`. All test units passed, with no failing test units.
4901
4901
 
4902
4902
 
4903
- rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4903
+ col_sum_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
4904
+ Does the column sum satisfy a greater than comparison?
4905
+
4906
+ The `col_sum_gt()` validation method checks whether the sum of values in a column
4907
+ is greater than a specified `value=`. This is an aggregation-based validation where the entire
4908
+ column is reduced to a single sum value that is then compared against the target. The
4909
+ comparison used in this function is `sum(column) > value`.
4910
+
4911
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
4912
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
4913
+ the comparison) or fails completely.
4914
+
4915
+ Parameters
4916
+ ----------
4917
+ columns
4918
+ A single column or a list of columns to validate. If multiple columns are supplied,
4919
+ there will be a separate validation step generated for each column. The columns must
4920
+ contain numeric data for the sum to be computed.
4921
+ value
4922
+ The value to compare the column sum against. This can be: (1) a numeric literal
4923
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
4924
+ whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
4925
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
4926
+ `None` to automatically compare against the same column in reference data (shorthand for
4927
+ `ref(column_name)` when reference data is set).
4928
+ tol
4929
+        A tolerance value for the comparison. The default is `0`, meaning exact comparison.
4930
+        When set to a positive value, the comparison becomes more lenient: the `tol=`
4931
+        parameter expands the acceptable range for the comparison, so for `col_sum_gt()` a
4932
+        tolerance of `tol=0.5` means the sum can be within `0.5` of the target value and
4933
+        still pass validation.
4934
+ thresholds
4935
+ Failure threshold levels so that the validation step can react accordingly when
4936
+        failing test units exceed the set levels. Since this is an aggregation-based validation with only
4937
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
4938
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
4939
+ acceptable.
4940
+ brief
4941
+ An optional brief description of the validation step that will be displayed in the
4942
+        reporting table. You can use templating elements like `"{step}"` to insert
4943
+        the step number, or `"{auto}"` to include an automatically generated brief. If `True`,
4944
+ the entire brief will be automatically generated. If `None` (the default) then there
4945
+ won't be a brief.
4946
+ actions
4947
+ Optional actions to take when the validation step meets or exceeds any set threshold
4948
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
4949
+ define the actions.
4950
+ active
4951
+ A boolean value indicating whether the validation step should be active. Using `False`
4952
+ will make the validation step inactive (still reporting its presence and keeping indexes
4953
+ for the steps unchanged).
4954
+
4955
+ Returns
4956
+ -------
4957
+ Validate
4958
+ The `Validate` object with the added validation step.
4959
+
4960
+ Using Reference Data
4961
+ --------------------
4962
+ The `col_sum_gt()` method supports comparing column aggregations against reference data. This
4963
+ is useful for validating that statistical properties remain consistent across different
4964
+ versions of a dataset, or for comparing current data against historical baselines.
4965
+
4966
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
4967
+
4968
+ ```python
4969
+ validation = (
4970
+ pb.Validate(data=current_data, reference=baseline_data)
4971
+ .col_sum_gt(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
4972
+ .interrogate()
4973
+ )
4974
+ ```
4975
+
4976
+ When `value=None` and reference data is set, the method automatically compares against the
4977
+ same column in the reference data. You can also explicitly specify reference columns using
4978
+ the `ref()` helper:
4979
+
4980
+ ```python
4981
+ .col_sum_gt(columns="revenue", value=pb.ref("baseline_revenue"))
4982
+ ```
4983
+
4984
+ Understanding Tolerance
4985
+ -----------------------
4986
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
4987
+ floating-point aggregations where exact equality is often unreliable.
4988
+
4989
+ The `tol=` parameter expands the acceptable range for the comparison. For
4990
+ `col_sum_gt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
4991
+ target value and still pass validation.
4992
+
4993
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
4994
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
4995
+ shifts the comparison boundary.
4996
+
4997
+ Thresholds
4998
+ ----------
4999
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
5000
+ step. If they are set here at the step level, these thresholds will override any thresholds
5001
+ set at the global level in `Validate(thresholds=...)`.
5002
+
5003
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
5004
+ validations operate on a single test unit (the aggregated value), threshold values are
5005
+ typically set as absolute counts:
5006
+
5007
+ - `thresholds=1` means any failure triggers a 'warning'
5008
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
5009
+
5010
+ Thresholds can be defined using one of these input schemes:
5011
+
5012
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
5013
+ thresholds)
5014
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
5015
+ the 'error' level, and position `2` is the 'critical' level
5016
+    3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
5017
+ 'critical'
5018
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
5019
+ for the 'warning' level only
5020
+
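As a quick illustration of the four schemes above, the settings below are equivalent ways to put a 'warning' level of one failing test unit on the step (a sketch, assuming the `Thresholds` constructor takes `warning=`, `error=`, and `critical=` keywords matching the level names):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2, 3, 4, 5]})

# four equivalent ways to express a 'warning' level of 1 failing unit
t1 = pb.Thresholds(warning=1)  # scheme 1: Thresholds class (assumed keyword)
t2 = (1,)                      # scheme 2: 1-tuple, 'warning' position only
t3 = {"warning": 1}            # scheme 3: dict keyed by level name
t4 = 1                         # scheme 4: bare value, 'warning' only

validation = (
    pb.Validate(data=tbl)
    .col_sum_gt(columns="a", value=15, thresholds=t3)
    .interrogate()
)
```
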
5021
+ Examples
5022
+ --------
5023
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
5024
+ shown below:
5025
+
5026
+ ```python
5027
+ import pointblank as pb
5028
+ import polars as pl
5029
+
5030
+ tbl = pl.DataFrame(
5031
+ {
5032
+ "a": [1, 2, 3, 4, 5],
5033
+ "b": [2, 2, 2, 2, 2],
5034
+ }
5035
+ )
5036
+
5037
+ pb.preview(tbl)
5038
+ ```
5039
+
5040
+ Let's validate that the sum of column `a` is greater than `15`:
5041
+
5042
+ ```python
5043
+ validation = (
5044
+ pb.Validate(data=tbl)
5045
+ .col_sum_gt(columns="a", value=15)
5046
+ .interrogate()
5047
+ )
5048
+
5049
+ validation
5050
+ ```
5051
+
5052
+ The validation result shows whether the sum comparison passed or failed. Since this
5053
+ is an aggregation-based validation, there is exactly one test unit per column.
5054
+
5055
+ When validating multiple columns, each column gets its own validation step:
5056
+
5057
+ ```python
5058
+ validation = (
5059
+ pb.Validate(data=tbl)
5060
+ .col_sum_gt(columns=["a", "b"], value=15)
5061
+ .interrogate()
5062
+ )
5063
+
5064
+ validation
5065
+ ```
5066
+
5067
+ Using tolerance for flexible comparisons:
5068
+
5069
+ ```python
5070
+ validation = (
5071
+ pb.Validate(data=tbl)
5072
+ .col_sum_gt(columns="a", value=15, tol=1.0)
5073
+ .interrogate()
5074
+ )
5075
+
5076
+ validation
5077
+ ```
5078
+
5079
+ col_sum_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
5080
+ Does the column sum satisfy a less than comparison?
5081
+
5082
+ The `col_sum_lt()` validation method checks whether the sum of values in a column
5083
+ is less than a specified `value=`. This is an aggregation-based validation where the entire
5084
+ column is reduced to a single sum value that is then compared against the target. The
5085
+ comparison used in this function is `sum(column) < value`.
5086
+
5087
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
5088
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
5089
+ the comparison) or fails completely.
5090
+
5091
+ Parameters
5092
+ ----------
5093
+ columns
5094
+ A single column or a list of columns to validate. If multiple columns are supplied,
5095
+ there will be a separate validation step generated for each column. The columns must
5096
+ contain numeric data for the sum to be computed.
5097
+ value
5098
+ The value to compare the column sum against. This can be: (1) a numeric literal
5099
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
5100
+ whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
5101
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
5102
+ `None` to automatically compare against the same column in reference data (shorthand for
5103
+ `ref(column_name)` when reference data is set).
5104
+ tol
5105
+        A tolerance value for the comparison. The default is `0`, meaning exact comparison.
5106
+        When set to a positive value, the comparison becomes more lenient: the `tol=`
5107
+        parameter expands the acceptable range for the comparison, so for `col_sum_lt()` a
5108
+        tolerance of `tol=0.5` means the sum can be within `0.5` of the target value and
5109
+        still pass validation.
5110
+ thresholds
5111
+ Failure threshold levels so that the validation step can react accordingly when
5112
+        failing test units exceed the set levels. Since this is an aggregation-based validation with only
5113
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
5114
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
5115
+ acceptable.
5116
+ brief
5117
+ An optional brief description of the validation step that will be displayed in the
5118
+        reporting table. You can use templating elements like `"{step}"` to insert
5119
+        the step number, or `"{auto}"` to include an automatically generated brief. If `True`,
5120
+ the entire brief will be automatically generated. If `None` (the default) then there
5121
+ won't be a brief.
5122
+ actions
5123
+ Optional actions to take when the validation step meets or exceeds any set threshold
5124
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
5125
+ define the actions.
5126
+ active
5127
+ A boolean value indicating whether the validation step should be active. Using `False`
5128
+ will make the validation step inactive (still reporting its presence and keeping indexes
5129
+ for the steps unchanged).
5130
+
5131
+ Returns
5132
+ -------
5133
+ Validate
5134
+ The `Validate` object with the added validation step.
5135
+
5136
+ Using Reference Data
5137
+ --------------------
5138
+ The `col_sum_lt()` method supports comparing column aggregations against reference data. This
5139
+ is useful for validating that statistical properties remain consistent across different
5140
+ versions of a dataset, or for comparing current data against historical baselines.
5141
+
5142
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
5143
+
5144
+ ```python
5145
+ validation = (
5146
+ pb.Validate(data=current_data, reference=baseline_data)
5147
+ .col_sum_lt(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
5148
+ .interrogate()
5149
+ )
5150
+ ```
5151
+
5152
+ When `value=None` and reference data is set, the method automatically compares against the
5153
+ same column in the reference data. You can also explicitly specify reference columns using
5154
+ the `ref()` helper:
5155
+
5156
+ ```python
5157
+ .col_sum_lt(columns="revenue", value=pb.ref("baseline_revenue"))
5158
+ ```
5159
+
5160
+ Understanding Tolerance
5161
+ -----------------------
5162
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
5163
+ floating-point aggregations where exact equality is often unreliable.
5164
+
5165
+ The `tol=` parameter expands the acceptable range for the comparison. For
5166
+ `col_sum_lt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
5167
+ target value and still pass validation.
5168
+
5169
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
5170
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
5171
+ shifts the comparison boundary.
5172
+
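A plain-Python sketch of the boundary shift described above (assuming the lenient direction for `col_sum_lt()` is `sum < value + tol`):

```python
# illustrative semantics only, not pointblank internals
value, tol = 15.0, 0.5

for total in (14.9, 15.2, 15.6):
    passes = total < value + tol  # tol relaxes the upper boundary
    print(total, passes)          # -> True, True, False
```
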
5173
+ Thresholds
5174
+ ----------
5175
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
5176
+ step. If they are set here at the step level, these thresholds will override any thresholds
5177
+ set at the global level in `Validate(thresholds=...)`.
5178
+
5179
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
5180
+ validations operate on a single test unit (the aggregated value), threshold values are
5181
+ typically set as absolute counts:
5182
+
5183
+ - `thresholds=1` means any failure triggers a 'warning'
5184
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
5185
+
5186
+ Thresholds can be defined using one of these input schemes:
5187
+
5188
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
5189
+ thresholds)
5190
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
5191
+ the 'error' level, and position `2` is the 'critical' level
5192
+    3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
5193
+ 'critical'
5194
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
5195
+ for the 'warning' level only
5196
+
5197
+ Examples
5198
+ --------
5199
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
5200
+ shown below:
5201
+
5202
+ ```python
5203
+ import pointblank as pb
5204
+ import polars as pl
5205
+
5206
+ tbl = pl.DataFrame(
5207
+ {
5208
+ "a": [1, 2, 3, 4, 5],
5209
+ "b": [2, 2, 2, 2, 2],
5210
+ }
5211
+ )
5212
+
5213
+ pb.preview(tbl)
5214
+ ```
5215
+
5216
+ Let's validate that the sum of column `a` is less than `15`:
5217
+
5218
+ ```python
5219
+ validation = (
5220
+ pb.Validate(data=tbl)
5221
+ .col_sum_lt(columns="a", value=15)
5222
+ .interrogate()
5223
+ )
5224
+
5225
+ validation
5226
+ ```
5227
+
5228
+ The validation result shows whether the sum comparison passed or failed. Since this
5229
+ is an aggregation-based validation, there is exactly one test unit per column.
5230
+
5231
+ When validating multiple columns, each column gets its own validation step:
5232
+
5233
+ ```python
5234
+ validation = (
5235
+ pb.Validate(data=tbl)
5236
+ .col_sum_lt(columns=["a", "b"], value=15)
5237
+ .interrogate()
5238
+ )
5239
+
5240
+ validation
5241
+ ```
5242
+
5243
+ Using tolerance for flexible comparisons:
5244
+
5245
+ ```python
5246
+ validation = (
5247
+ pb.Validate(data=tbl)
5248
+ .col_sum_lt(columns="a", value=15, tol=1.0)
5249
+ .interrogate()
5250
+ )
5251
+
5252
+ validation
5253
+ ```
5254
+
5255
+ col_sum_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
5256
+ Does the column sum satisfy a greater than or equal to comparison?
5257
+
5258
+ The `col_sum_ge()` validation method checks whether the sum of values in a column
5259
+ is at least a specified `value=`. This is an aggregation-based validation where the entire
5260
+ column is reduced to a single sum value that is then compared against the target. The
5261
+ comparison used in this function is `sum(column) >= value`.
5262
+
5263
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
5264
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
5265
+ the comparison) or fails completely.
5266
+
5267
+ Parameters
5268
+ ----------
5269
+ columns
5270
+ A single column or a list of columns to validate. If multiple columns are supplied,
5271
+ there will be a separate validation step generated for each column. The columns must
5272
+ contain numeric data for the sum to be computed.
5273
+ value
5274
+ The value to compare the column sum against. This can be: (1) a numeric literal
5275
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
5276
+ whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
5277
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
5278
+ `None` to automatically compare against the same column in reference data (shorthand for
5279
+ `ref(column_name)` when reference data is set).
5280
+ tol
5281
+        A tolerance value for the comparison. The default is `0`, meaning exact comparison.
5282
+        When set to a positive value, the comparison becomes more lenient: the `tol=`
5283
+        parameter expands the acceptable range for the comparison, so for `col_sum_ge()` a
5284
+        tolerance of `tol=0.5` means the sum can be within `0.5` of the target value and
5285
+        still pass validation.
5286
+ thresholds
5287
+ Failure threshold levels so that the validation step can react accordingly when
5288
+        failing test units exceed the set levels. Since this is an aggregation-based validation with only
5289
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
5290
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
5291
+ acceptable.
5292
+ brief
5293
+ An optional brief description of the validation step that will be displayed in the
5294
+        reporting table. You can use templating elements like `"{step}"` to insert
5295
+        the step number, or `"{auto}"` to include an automatically generated brief. If `True`,
5296
+ the entire brief will be automatically generated. If `None` (the default) then there
5297
+ won't be a brief.
5298
+ actions
5299
+ Optional actions to take when the validation step meets or exceeds any set threshold
5300
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
5301
+ define the actions.
5302
+ active
5303
+ A boolean value indicating whether the validation step should be active. Using `False`
5304
+ will make the validation step inactive (still reporting its presence and keeping indexes
5305
+ for the steps unchanged).
5306
+
5307
+ Returns
5308
+ -------
5309
+ Validate
5310
+ The `Validate` object with the added validation step.
5311
+
5312
+ Using Reference Data
5313
+ --------------------
5314
+ The `col_sum_ge()` method supports comparing column aggregations against reference data. This
5315
+ is useful for validating that statistical properties remain consistent across different
5316
+ versions of a dataset, or for comparing current data against historical baselines.
5317
+
5318
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
5319
+
5320
+ ```python
5321
+ validation = (
5322
+ pb.Validate(data=current_data, reference=baseline_data)
5323
+ .col_sum_ge(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
5324
+ .interrogate()
5325
+ )
5326
+ ```
5327
+
5328
+ When `value=None` and reference data is set, the method automatically compares against the
5329
+ same column in the reference data. You can also explicitly specify reference columns using
5330
+ the `ref()` helper:
5331
+
5332
+ ```python
5333
+ .col_sum_ge(columns="revenue", value=pb.ref("baseline_revenue"))
5334
+ ```
5335
+
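Besides `ref()`, the `value=` parameter also accepts a `col()` object, in which case the sum of another column in the same table serves as the target (a sketch):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]})

validation = (
    pb.Validate(data=tbl)
    # passes when sum(a) >= sum(b), i.e., 15 >= 10
    .col_sum_ge(columns="a", value=pb.col("b"))
    .interrogate()
)
```
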
5336
+ Understanding Tolerance
5337
+ -----------------------
5338
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
5339
+ floating-point aggregations where exact equality is often unreliable.
5340
+
5341
+ The `tol=` parameter expands the acceptable range for the comparison. For
5342
+ `col_sum_ge()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
5343
+ target value and still pass validation.
5344
+
5345
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
5346
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
5347
+ shifts the comparison boundary.
5348
+
5349
+ Thresholds
5350
+ ----------
5351
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
5352
+ step. If they are set here at the step level, these thresholds will override any thresholds
5353
+ set at the global level in `Validate(thresholds=...)`.
5354
+
5355
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
5356
+ validations operate on a single test unit (the aggregated value), threshold values are
5357
+ typically set as absolute counts:
5358
+
5359
+ - `thresholds=1` means any failure triggers a 'warning'
5360
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
5361
+
5362
+ Thresholds can be defined using one of these input schemes:
5363
+
5364
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
5365
+ thresholds)
5366
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
5367
+ the 'error' level, and position `2` is the 'critical' level
5368
+    3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
5369
+ 'critical'
5370
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
5371
+ for the 'warning' level only
5372
+
5373
+ Examples
5374
+ --------
5375
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
5376
+ shown below:
5377
+
5378
+ ```python
5379
+ import pointblank as pb
5380
+ import polars as pl
5381
+
5382
+ tbl = pl.DataFrame(
5383
+ {
5384
+ "a": [1, 2, 3, 4, 5],
5385
+ "b": [2, 2, 2, 2, 2],
5386
+ }
5387
+ )
5388
+
5389
+ pb.preview(tbl)
5390
+ ```
5391
+
5392
+ Let's validate that the sum of column `a` is at least `15`:
5393
+
5394
+ ```python
5395
+ validation = (
5396
+ pb.Validate(data=tbl)
5397
+ .col_sum_ge(columns="a", value=15)
5398
+ .interrogate()
5399
+ )
5400
+
5401
+ validation
5402
+ ```
5403
+
5404
+ The validation result shows whether the sum comparison passed or failed. Since this
5405
+ is an aggregation-based validation, there is exactly one test unit per column.
5406
+
5407
+ When validating multiple columns, each column gets its own validation step:
5408
+
5409
+ ```python
5410
+ validation = (
5411
+ pb.Validate(data=tbl)
5412
+ .col_sum_ge(columns=["a", "b"], value=15)
5413
+ .interrogate()
5414
+ )
5415
+
5416
+ validation
5417
+ ```
5418
+
5419
+ Using tolerance for flexible comparisons:
5420
+
5421
+ ```python
5422
+ validation = (
5423
+ pb.Validate(data=tbl)
5424
+ .col_sum_ge(columns="a", value=15, tol=1.0)
5425
+ .interrogate()
5426
+ )
5427
+
5428
+ validation
5429
+ ```
5430
+
5431
+ col_sum_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
5432
+ Does the column sum satisfy a less than or equal to comparison?
5433
+
5434
+ The `col_sum_le()` validation method checks whether the sum of values in a column
5435
+ is at most a specified `value=`. This is an aggregation-based validation where the entire
5436
+ column is reduced to a single sum value that is then compared against the target. The
5437
+ comparison used in this function is `sum(column) <= value`.
5438
+
5439
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
5440
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
5441
+ the comparison) or fails completely.
5442
+
5443
+ Parameters
5444
+ ----------
5445
+ columns
5446
+ A single column or a list of columns to validate. If multiple columns are supplied,
5447
+ there will be a separate validation step generated for each column. The columns must
5448
+ contain numeric data for the sum to be computed.
5449
+ value
5450
+ The value to compare the column sum against. This can be: (1) a numeric literal
5451
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
5452
+ whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
5453
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
5454
+ `None` to automatically compare against the same column in reference data (shorthand for
5455
+ `ref(column_name)` when reference data is set).
5456
+ tol
5457
+        A tolerance value for the comparison. The default is `0`, meaning exact comparison.
5458
+        When set to a positive value, the comparison becomes more lenient: the `tol=`
5459
+        parameter expands the acceptable range for the comparison, so for `col_sum_le()` a
5460
+        tolerance of `tol=0.5` means the sum can be within `0.5` of the target value and
5461
+        still pass validation.
5462
+ thresholds
5463
+ Failure threshold levels so that the validation step can react accordingly when
5464
+        failing test units exceed the set levels. Since this is an aggregation-based validation with only
5465
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
5466
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
5467
+ acceptable.
5468
+ brief
5469
+ An optional brief description of the validation step that will be displayed in the
5470
+        reporting table. You can use templating elements like `"{step}"` to insert
5471
+        the step number, or `"{auto}"` to include an automatically generated brief. If `True`,
5472
+ the entire brief will be automatically generated. If `None` (the default) then there
5473
+ won't be a brief.
5474
+ actions
5475
+ Optional actions to take when the validation step meets or exceeds any set threshold
5476
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
5477
+ define the actions.
5478
+ active
5479
+ A boolean value indicating whether the validation step should be active. Using `False`
5480
+ will make the validation step inactive (still reporting its presence and keeping indexes
5481
+ for the steps unchanged).
5482
+
5483
+ Returns
5484
+ -------
5485
+ Validate
5486
+ The `Validate` object with the added validation step.
5487
+
5488
+ Using Reference Data
5489
+ --------------------
5490
+ The `col_sum_le()` method supports comparing column aggregations against reference data. This
5491
+ is useful for validating that statistical properties remain consistent across different
5492
+ versions of a dataset, or for comparing current data against historical baselines.
5493
+
5494
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
5495
+
5496
+ ```python
5497
+ validation = (
5498
+ pb.Validate(data=current_data, reference=baseline_data)
5499
+ .col_sum_le(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
5500
+ .interrogate()
5501
+ )
5502
+ ```
5503
+
5504
+ When `value=None` and reference data is set, the method automatically compares against the
5505
+ same column in the reference data. You can also explicitly specify reference columns using
5506
+ the `ref()` helper:
5507
+
5508
+ ```python
5509
+ .col_sum_le(columns="revenue", value=pb.ref("baseline_revenue"))
5510
+ ```
5511
+
5512
+ Understanding Tolerance
5513
+ -----------------------
5514
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
5515
+ floating-point aggregations where exact equality is often unreliable.
5516
+
5517
+ The `tol=` parameter expands the acceptable range for the comparison. For
5518
+ `col_sum_le()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
5519
+ target value and still pass validation.
5520
+
5521
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
5522
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
5523
+ shifts the comparison boundary.
5524
+
5525
+ Thresholds
5526
+ ----------
5527
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
5528
+ step. If they are set here at the step level, these thresholds will override any thresholds
5529
+ set at the global level in `Validate(thresholds=...)`.
5530
+
5531
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
5532
+ validations operate on a single test unit (the aggregated value), threshold values are
5533
+ typically set as absolute counts:
5534
+
5535
+ - `thresholds=1` means any failure triggers a 'warning'
5536
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
5537
+
5538
+ Thresholds can be defined using one of these input schemes:
5539
+
5540
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
5541
+ thresholds)
5542
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
5543
+ the 'error' level, and position `2` is the 'critical' level
5544
+    3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
5545
+ 'critical'
5546
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
5547
+ for the 'warning' level only
5548
+
5549
+ Examples
5550
+ --------
5551
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
5552
+ shown below:
5553
+
5554
+ ```python
5555
+ import pointblank as pb
5556
+ import polars as pl
5557
+
5558
+ tbl = pl.DataFrame(
5559
+ {
5560
+ "a": [1, 2, 3, 4, 5],
5561
+ "b": [2, 2, 2, 2, 2],
5562
+ }
5563
+ )
5564
+
5565
+ pb.preview(tbl)
5566
+ ```
5567
+
5568
+ Let's validate that the sum of column `a` is at most `15`:
5569
+
5570
+ ```python
5571
+ validation = (
5572
+ pb.Validate(data=tbl)
5573
+ .col_sum_le(columns="a", value=15)
5574
+ .interrogate()
5575
+ )
5576
+
5577
+ validation
5578
+ ```
5579
+
5580
+ The validation result shows whether the sum comparison passed or failed. Since this
5581
+ is an aggregation-based validation, there is exactly one test unit per column.
5582
+
5583
+ When validating multiple columns, each column gets its own validation step:
5584
+
5585
+ ```python
5586
+ validation = (
5587
+ pb.Validate(data=tbl)
5588
+ .col_sum_le(columns=["a", "b"], value=15)
5589
+ .interrogate()
5590
+ )
5591
+
5592
+ validation
5593
+ ```
5594
+
5595
+ Using tolerance for flexible comparisons:
5596
+
5597
+ ```python
5598
+ validation = (
5599
+ pb.Validate(data=tbl)
5600
+ .col_sum_le(columns="a", value=15, tol=1.0)
5601
+ .interrogate()
5602
+ )
5603
+
5604
+ validation
5605
+ ```
5606
+
5607
+ col_sum_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
5608
+ Does the column sum satisfy an equal to comparison?
5609
+
5610
+ The `col_sum_eq()` validation method checks whether the sum of values in a column
5611
+ equals a specified `value=`. This is an aggregation-based validation where the entire
5612
+ column is reduced to a single sum value that is then compared against the target. The
5613
+ comparison used in this function is `sum(column) == value`.
5614
+
5615
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
5616
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
5617
+ the comparison) or fails completely.
5618
+
5619
+ Parameters
5620
+ ----------
5621
+ columns
5622
+ A single column or a list of columns to validate. If multiple columns are supplied,
5623
+ there will be a separate validation step generated for each column. The columns must
5624
+ contain numeric data for the sum to be computed.
5625
+ value
5626
+ The value to compare the column sum against. This can be: (1) a numeric literal
5627
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
5628
+ whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
5629
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
5630
+ `None` to automatically compare against the same column in reference data (shorthand for
5631
+ `ref(column_name)` when reference data is set).
5632
+ tol
5633
+        A tolerance value for the comparison. The default is `0`, meaning exact comparison.
5634
+        When set to a positive value, the comparison becomes more lenient: for example, with
5635
+        `tol=0.5`, a sum that differs from the target by up to `0.5` will still pass. The
5636
+        `tol=` parameter is particularly useful here, since exact equality comparisons on
5637
+        floating-point aggregations can be problematic due to numerical precision; a small
5638
+        tolerance (e.g., `tol=0.001`) allows for the minor differences of floating-point arithmetic.
5639
+ thresholds
5640
+        failing test units exceed the set levels. Since this is an aggregation-based validation with only
5641
+ failing test units are level. Since this is an aggregation-based validation with only
5642
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
5643
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
5644
+ acceptable.
5645
+ brief
5646
+ An optional brief description of the validation step that will be displayed in the
5647
+ reporting table. You can use the templating elements like `"{step}"` to insert
5648
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
5649
+ the entire brief will be automatically generated. If `None` (the default) then there
5650
+ won't be a brief.
5651
+ actions
5652
+ Optional actions to take when the validation step meets or exceeds any set threshold
5653
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
5654
+ define the actions.
5655
+ active
5656
+ A boolean value indicating whether the validation step should be active. Using `False`
5657
+ will make the validation step inactive (still reporting its presence and keeping indexes
5658
+ for the steps unchanged).
5659
+
5660
+ Returns
5661
+ -------
5662
+ Validate
5663
+ The `Validate` object with the added validation step.
5664
+
5665
+ Using Reference Data
5666
+ --------------------
5667
+ The `col_sum_eq()` method supports comparing column aggregations against reference data. This
5668
+ is useful for validating that statistical properties remain consistent across different
5669
+ versions of a dataset, or for comparing current data against historical baselines.
5670
+
5671
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
5672
+
5673
+ ```python
5674
+ validation = (
5675
+ pb.Validate(data=current_data, reference=baseline_data)
5676
+ .col_sum_eq(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
5677
+ .interrogate()
5678
+ )
5679
+ ```
5680
+
5681
+ When `value=None` and reference data is set, the method automatically compares against the
5682
+ same column in the reference data. You can also explicitly specify reference columns using
5683
+ the `ref()` helper:
5684
+
5685
+ ```python
5686
+ .col_sum_eq(columns="revenue", value=pb.ref("baseline_revenue"))
5687
+ ```
5688
+
5689
+ Understanding Tolerance
5690
+ -----------------------
5691
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
5692
+ floating-point aggregations where exact equality is often unreliable.
5693
+
5694
+ The `tol=` parameter is particularly useful with `col_sum_eq()` since exact equality
5695
+ comparisons on floating-point aggregations can be problematic due to numerical precision.
5696
+ Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
5697
+ floating-point arithmetic.
5698
+
5699
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
5700
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
5701
+ shifts the comparison boundary.
5702
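+
+ As a concrete illustration, a small tolerance absorbs the rounding error that
+ floating-point sums accumulate. A minimal sketch (the `float_tbl` table here is
+ hypothetical, and the boundary behavior is as described above):
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ # 0.1 + 0.2 + 0.3 evaluates to 0.6000000000000001 in IEEE 754 arithmetic
+ float_tbl = pl.DataFrame({"x": [0.1, 0.2, 0.3]})
+
+ validation = (
+ pb.Validate(data=float_tbl)
+ .col_sum_eq(columns="x", value=0.6, tol=1e-9)  # passes despite the rounding error
+ .interrogate()
+ )
+
+ validation
+ ```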
+
5703
+ Thresholds
5704
+ ----------
5705
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
5706
+ step. If they are set here at the step level, these thresholds will override any thresholds
5707
+ set at the global level in `Validate(thresholds=...)`.
5708
+
5709
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
5710
+ validations operate on a single test unit (the aggregated value), threshold values are
5711
+ typically set as absolute counts:
5712
+
5713
+ - `thresholds=1` means any failure triggers a 'warning'
5714
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
5715
+
5716
+ Thresholds can be defined using one of these input schemes:
5717
+
5718
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
5719
+ thresholds)
5720
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
5721
+ the 'error' level, and position `2` is the 'critical' level
5722
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
5723
+ 'critical'
5724
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
5725
+ for the 'warning' level only
5726
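+
+ For instance, to have any failure on this single-test-unit step register as both a
+ 'warning' and an 'error', the thresholds can be given as a tuple or, equivalently,
+ through the `Thresholds` class. A sketch using the `tbl` table from the Examples
+ section below, and assuming `Thresholds` accepts `warning=`/`error=` keyword
+ arguments mirroring the dictionary keys above:
+
+ ```python
+ validation = (
+ pb.Validate(data=tbl)
+ .col_sum_eq(columns="a", value=100, thresholds=(1, 1))  # warning + error on failure
+ .interrogate()
+ )
+
+ # the same thresholds, expressed with the Thresholds class
+ validation = (
+ pb.Validate(data=tbl)
+ .col_sum_eq(columns="a", value=100, thresholds=pb.Thresholds(warning=1, error=1))
+ .interrogate()
+ )
+ ```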
+
5727
+ Examples
5728
+ --------
5729
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
5730
+ shown below:
5731
+
5732
+ ```python
5733
+ import pointblank as pb
5734
+ import polars as pl
5735
+
5736
+ tbl = pl.DataFrame(
5737
+ {
5738
+ "a": [1, 2, 3, 4, 5],
5739
+ "b": [2, 2, 2, 2, 2],
5740
+ }
5741
+ )
5742
+
5743
+ pb.preview(tbl)
5744
+ ```
5745
+
5746
+ Let's validate that the sum of column `a` equals `15`:
5747
+
5748
+ ```python
5749
+ validation = (
5750
+ pb.Validate(data=tbl)
5751
+ .col_sum_eq(columns="a", value=15)
5752
+ .interrogate()
5753
+ )
5754
+
5755
+ validation
5756
+ ```
5757
+
5758
+ The validation result shows whether the sum comparison passed or failed. Since this
5759
+ is an aggregation-based validation, there is exactly one test unit per column.
5760
+
5761
+ When validating multiple columns, each column gets its own validation step:
5762
+
5763
+ ```python
5764
+ validation = (
5765
+ pb.Validate(data=tbl)
5766
+ .col_sum_eq(columns=["a", "b"], value=15)
5767
+ .interrogate()
5768
+ )
5769
+
5770
+ validation
5771
+ ```
5772
+
5773
+ Using tolerance for flexible comparisons:
5774
+
5775
+ ```python
5776
+ validation = (
5777
+ pb.Validate(data=tbl)
5778
+ .col_sum_eq(columns="a", value=15, tol=1.0)
5779
+ .interrogate()
5780
+ )
5781
+
5782
+ validation
5783
+ ```
5784
+
5785
+ col_avg_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
5786
+ Does the column average satisfy a greater than comparison?
5787
+
5788
+ The `col_avg_gt()` validation method checks whether the average of values in a column
5789
+ is greater than a specified `value=`. This is an aggregation-based validation where the entire
5790
+ column is reduced to a single average value that is then compared against the target. The
5791
+ comparison used in this function is `average(column) > value`.
5792
+
5793
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
5794
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
5795
+ the comparison) or fails completely.
5796
+
5797
+ Parameters
5798
+ ----------
5799
+ columns
5800
+ A single column or a list of columns to validate. If multiple columns are supplied,
5801
+ there will be a separate validation step generated for each column. The columns must
5802
+ contain numeric data for the average to be computed.
5803
+ value
5804
+ The value to compare the column average against. This can be: (1) a numeric literal
5805
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
5806
+ whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
5807
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
5808
+ `None` to automatically compare against the same column in reference data (shorthand for
5809
+ `ref(column_name)` when reference data is set).
5810
+ tol
5811
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
5812
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
5813
+ an average up to `0.5` below the target will still pass (see the Understanding
+ Tolerance section below).
5816
+ thresholds
5817
+ Failure threshold levels so that the validation step can react accordingly when
5818
+ failing test units exceed the set levels. Since this is an aggregation-based validation with only
5819
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
5820
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
5821
+ acceptable.
5822
+ brief
5823
+ An optional brief description of the validation step that will be displayed in the
5824
+ reporting table. You can use the templating elements like `"{step}"` to insert
5825
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
5826
+ the entire brief will be automatically generated. If `None` (the default) then there
5827
+ won't be a brief.
5828
+ actions
5829
+ Optional actions to take when the validation step meets or exceeds any set threshold
5830
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
5831
+ define the actions.
5832
+ active
5833
+ A boolean value indicating whether the validation step should be active. Using `False`
5834
+ will make the validation step inactive (still reporting its presence and keeping indexes
5835
+ for the steps unchanged).
5836
+
5837
+ Returns
5838
+ -------
5839
+ Validate
5840
+ The `Validate` object with the added validation step.
5841
+
5842
+ Using Reference Data
5843
+ --------------------
5844
+ The `col_avg_gt()` method supports comparing column aggregations against reference data. This
5845
+ is useful for validating that statistical properties remain consistent across different
5846
+ versions of a dataset, or for comparing current data against historical baselines.
5847
+
5848
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
5849
+
5850
+ ```python
5851
+ validation = (
5852
+ pb.Validate(data=current_data, reference=baseline_data)
5853
+ .col_avg_gt(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
5854
+ .interrogate()
5855
+ )
5856
+ ```
5857
+
5858
+ When `value=None` and reference data is set, the method automatically compares against the
5859
+ same column in the reference data. You can also explicitly specify reference columns using
5860
+ the `ref()` helper:
5861
+
5862
+ ```python
5863
+ .col_avg_gt(columns="revenue", value=pb.ref("baseline_revenue"))
5864
+ ```
5865
+
5866
+ Understanding Tolerance
5867
+ -----------------------
5868
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
5869
+ floating-point aggregations where exact equality is often unreliable.
5870
+
5871
+ The `tol=` parameter expands the acceptable range for the comparison. For
+ `col_avg_gt()`, a tolerance of `tol=0.5` lowers the effective boundary, so an
+ average as low as `value - 0.5` will still pass validation.
5874
+
5875
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
5876
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
5877
+ shifts the comparison boundary.
5878
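+
+ To make the boundary shift concrete: with the example table used throughout this
+ page (where the average of column `a` is exactly `3`), a strict comparison against
+ `3.2` fails, while `tol=0.5` lowers the effective boundary to `2.7` and the step
+ passes. A sketch, assuming the boundary behavior described above:
+
+ ```python
+ validation = (
+ pb.Validate(data=tbl)
+ .col_avg_gt(columns="a", value=3.2)           # fails: 3 > 3.2 is False
+ .col_avg_gt(columns="a", value=3.2, tol=0.5)  # passes: 3 > 3.2 - 0.5
+ .interrogate()
+ )
+ ```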
+
5879
+ Thresholds
5880
+ ----------
5881
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
5882
+ step. If they are set here at the step level, these thresholds will override any thresholds
5883
+ set at the global level in `Validate(thresholds=...)`.
5884
+
5885
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
5886
+ validations operate on a single test unit (the aggregated value), threshold values are
5887
+ typically set as absolute counts:
5888
+
5889
+ - `thresholds=1` means any failure triggers a 'warning'
5890
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
5891
+
5892
+ Thresholds can be defined using one of these input schemes:
5893
+
5894
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
5895
+ thresholds)
5896
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
5897
+ the 'error' level, and position `2` is the 'critical' level
5898
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
5899
+ 'critical'
5900
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
5901
+ for the 'warning' level only
5902
+
5903
+ Examples
5904
+ --------
5905
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
5906
+ shown below:
5907
+
5908
+ ```python
5909
+ import pointblank as pb
5910
+ import polars as pl
5911
+
5912
+ tbl = pl.DataFrame(
5913
+ {
5914
+ "a": [1, 2, 3, 4, 5],
5915
+ "b": [2, 2, 2, 2, 2],
5916
+ }
5917
+ )
5918
+
5919
+ pb.preview(tbl)
5920
+ ```
5921
+
5922
+ Let's validate that the average of column `a` is greater than `3`:
5923
+
5924
+ ```python
5925
+ validation = (
5926
+ pb.Validate(data=tbl)
5927
+ .col_avg_gt(columns="a", value=3)
5928
+ .interrogate()
5929
+ )
5930
+
5931
+ validation
5932
+ ```
5933
+
5934
+ The validation result shows whether the average comparison passed or failed. Since this
+ is an aggregation-based validation, there is exactly one test unit per column. (Note that
+ the average of column `a` is exactly `3`, so this strict greater-than comparison fails.)
5936
+
5937
+ When validating multiple columns, each column gets its own validation step:
5938
+
5939
+ ```python
5940
+ validation = (
5941
+ pb.Validate(data=tbl)
5942
+ .col_avg_gt(columns=["a", "b"], value=3)
5943
+ .interrogate()
5944
+ )
5945
+
5946
+ validation
5947
+ ```
5948
+
5949
+ Using tolerance for flexible comparisons:
5950
+
5951
+ ```python
5952
+ validation = (
5953
+ pb.Validate(data=tbl)
5954
+ .col_avg_gt(columns="a", value=3, tol=1.0)
5955
+ .interrogate()
5956
+ )
5957
+
5958
+ validation
5959
+ ```
5960
+
5961
+ col_avg_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
5962
+ Does the column average satisfy a less than comparison?
5963
+
5964
+ The `col_avg_lt()` validation method checks whether the average of values in a column
5965
+ is less than a specified `value=`. This is an aggregation-based validation where the entire
5966
+ column is reduced to a single average value that is then compared against the target. The
5967
+ comparison used in this function is `average(column) < value`.
5968
+
5969
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
5970
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
5971
+ the comparison) or fails completely.
5972
+
5973
+ Parameters
5974
+ ----------
5975
+ columns
5976
+ A single column or a list of columns to validate. If multiple columns are supplied,
5977
+ there will be a separate validation step generated for each column. The columns must
5978
+ contain numeric data for the average to be computed.
5979
+ value
5980
+ The value to compare the column average against. This can be: (1) a numeric literal
5981
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
5982
+ whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
5983
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
5984
+ `None` to automatically compare against the same column in reference data (shorthand for
5985
+ `ref(column_name)` when reference data is set).
5986
+ tol
5987
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
5988
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
5989
+ an average up to `0.5` above the target will still pass (see the Understanding
+ Tolerance section below).
5992
+ thresholds
5993
+ Failure threshold levels so that the validation step can react accordingly when
5994
+ failing test units exceed the set levels. Since this is an aggregation-based validation with only
5995
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
5996
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
5997
+ acceptable.
5998
+ brief
5999
+ An optional brief description of the validation step that will be displayed in the
6000
+ reporting table. You can use the templating elements like `"{step}"` to insert
6001
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
6002
+ the entire brief will be automatically generated. If `None` (the default) then there
6003
+ won't be a brief.
6004
+ actions
6005
+ Optional actions to take when the validation step meets or exceeds any set threshold
6006
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
6007
+ define the actions.
6008
+ active
6009
+ A boolean value indicating whether the validation step should be active. Using `False`
6010
+ will make the validation step inactive (still reporting its presence and keeping indexes
6011
+ for the steps unchanged).
6012
+
6013
+ Returns
6014
+ -------
6015
+ Validate
6016
+ The `Validate` object with the added validation step.
6017
+
6018
+ Using Reference Data
6019
+ --------------------
6020
+ The `col_avg_lt()` method supports comparing column aggregations against reference data. This
6021
+ is useful for validating that statistical properties remain consistent across different
6022
+ versions of a dataset, or for comparing current data against historical baselines.
6023
+
6024
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
6025
+
6026
+ ```python
6027
+ validation = (
6028
+ pb.Validate(data=current_data, reference=baseline_data)
6029
+ .col_avg_lt(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
6030
+ .interrogate()
6031
+ )
6032
+ ```
6033
+
6034
+ When `value=None` and reference data is set, the method automatically compares against the
6035
+ same column in the reference data. You can also explicitly specify reference columns using
6036
+ the `ref()` helper:
6037
+
6038
+ ```python
6039
+ .col_avg_lt(columns="revenue", value=pb.ref("baseline_revenue"))
6040
+ ```
6041
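+
+ Putting these pieces together, here is a self-contained sketch of a reference-data
+ comparison (the `current` and `baseline` tables are hypothetical):
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ current = pl.DataFrame({"a": [1.0, 2.0, 3.0]})   # avg(a) == 2.0
+ baseline = pl.DataFrame({"a": [2.0, 3.0, 4.0]})  # avg(a) == 3.0
+
+ validation = (
+ pb.Validate(data=current, reference=baseline)
+ .col_avg_lt(columns="a")  # value=None: checks avg(current.a) < avg(baseline.a)
+ .interrogate()
+ )
+
+ validation
+ ```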
+
6042
+ Understanding Tolerance
6043
+ -----------------------
6044
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
6045
+ floating-point aggregations where exact equality is often unreliable.
6046
+
6047
+ The `tol=` parameter expands the acceptable range for the comparison. For
+ `col_avg_lt()`, a tolerance of `tol=0.5` raises the effective boundary, so an
+ average as high as `value + 0.5` will still pass validation.
6050
+
6051
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
6052
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
6053
+ shifts the comparison boundary.
6054
+
6055
+ Thresholds
6056
+ ----------
6057
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
6058
+ step. If they are set here at the step level, these thresholds will override any thresholds
6059
+ set at the global level in `Validate(thresholds=...)`.
6060
+
6061
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
6062
+ validations operate on a single test unit (the aggregated value), threshold values are
6063
+ typically set as absolute counts:
6064
+
6065
+ - `thresholds=1` means any failure triggers a 'warning'
6066
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
6067
+
6068
+ Thresholds can be defined using one of these input schemes:
6069
+
6070
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
6071
+ thresholds)
6072
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
6073
+ the 'error' level, and position `2` is the 'critical' level
6074
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
6075
+ 'critical'
6076
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
6077
+ for the 'warning' level only
6078
+
6079
+ Examples
6080
+ --------
6081
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
6082
+ shown below:
6083
+
6084
+ ```python
6085
+ import pointblank as pb
6086
+ import polars as pl
6087
+
6088
+ tbl = pl.DataFrame(
6089
+ {
6090
+ "a": [1, 2, 3, 4, 5],
6091
+ "b": [2, 2, 2, 2, 2],
6092
+ }
6093
+ )
6094
+
6095
+ pb.preview(tbl)
6096
+ ```
6097
+
6098
+ Let's validate that the average of column `a` is less than `3`:
6099
+
6100
+ ```python
6101
+ validation = (
6102
+ pb.Validate(data=tbl)
6103
+ .col_avg_lt(columns="a", value=3)
6104
+ .interrogate()
6105
+ )
6106
+
6107
+ validation
6108
+ ```
6109
+
6110
+ The validation result shows whether the average comparison passed or failed. Since this
6111
+ is an aggregation-based validation, there is exactly one test unit per column.
6112
+
6113
+ When validating multiple columns, each column gets its own validation step:
6114
+
6115
+ ```python
6116
+ validation = (
6117
+ pb.Validate(data=tbl)
6118
+ .col_avg_lt(columns=["a", "b"], value=3)
6119
+ .interrogate()
6120
+ )
6121
+
6122
+ validation
6123
+ ```
6124
+
6125
+ Using tolerance for flexible comparisons:
6126
+
6127
+ ```python
6128
+ validation = (
6129
+ pb.Validate(data=tbl)
6130
+ .col_avg_lt(columns="a", value=3, tol=1.0)
6131
+ .interrogate()
6132
+ )
6133
+
6134
+ validation
6135
+ ```
6136
+
6137
+ col_avg_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
6138
+ Does the column average satisfy a greater than or equal to comparison?
6139
+
6140
+ The `col_avg_ge()` validation method checks whether the average of values in a column
6141
+ is at least a specified `value=`. This is an aggregation-based validation where the entire
6142
+ column is reduced to a single average value that is then compared against the target. The
6143
+ comparison used in this function is `average(column) >= value`.
6144
+
6145
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
6146
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
6147
+ the comparison) or fails completely.
6148
+
6149
+ Parameters
6150
+ ----------
6151
+ columns
6152
+ A single column or a list of columns to validate. If multiple columns are supplied,
6153
+ there will be a separate validation step generated for each column. The columns must
6154
+ contain numeric data for the average to be computed.
6155
+ value
6156
+ The value to compare the column average against. This can be: (1) a numeric literal
6157
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
6158
+ whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
6159
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
6160
+ `None` to automatically compare against the same column in reference data (shorthand for
6161
+ `ref(column_name)` when reference data is set).
6162
+ tol
6163
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
6164
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
6165
+ an average up to `0.5` below the target will still pass (see the Understanding
+ Tolerance section below).
6168
+ thresholds
6169
+ Failure threshold levels so that the validation step can react accordingly when
6170
+ failing test units exceed the set levels. Since this is an aggregation-based validation with only
6171
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
6172
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
6173
+ acceptable.
6174
+ brief
6175
+ An optional brief description of the validation step that will be displayed in the
6176
+ reporting table. You can use the templating elements like `"{step}"` to insert
6177
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
6178
+ the entire brief will be automatically generated. If `None` (the default) then there
6179
+ won't be a brief.
6180
+ actions
6181
+ Optional actions to take when the validation step meets or exceeds any set threshold
6182
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
6183
+ define the actions.
6184
+ active
6185
+ A boolean value indicating whether the validation step should be active. Using `False`
6186
+ will make the validation step inactive (still reporting its presence and keeping indexes
6187
+ for the steps unchanged).
6188
+
6189
+ Returns
6190
+ -------
6191
+ Validate
6192
+ The `Validate` object with the added validation step.
6193
+
6194
+ Using Reference Data
6195
+ --------------------
6196
+ The `col_avg_ge()` method supports comparing column aggregations against reference data. This
6197
+ is useful for validating that statistical properties remain consistent across different
6198
+ versions of a dataset, or for comparing current data against historical baselines.
6199
+
6200
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
6201
+
6202
+ ```python
6203
+ validation = (
6204
+ pb.Validate(data=current_data, reference=baseline_data)
6205
+ .col_avg_ge(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
6206
+ .interrogate()
6207
+ )
6208
+ ```
6209
+
6210
+ When `value=None` and reference data is set, the method automatically compares against the
6211
+ same column in the reference data. You can also explicitly specify reference columns using
6212
+ the `ref()` helper:
6213
+
6214
+ ```python
6215
+ .col_avg_ge(columns="revenue", value=pb.ref("baseline_revenue"))
6216
+ ```
6217
+
6218
+ Understanding Tolerance
6219
+ -----------------------
6220
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
6221
+ floating-point aggregations where exact equality is often unreliable.
6222
+
6223
+ The `tol=` parameter expands the acceptable range for the comparison. For
+ `col_avg_ge()`, a tolerance of `tol=0.5` lowers the effective boundary, so an
+ average as low as `value - 0.5` will still pass validation.
6226
+
6227
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
6228
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
6229
+ shifts the comparison boundary.
6230
+
6231
+ Thresholds
6232
+ ----------
6233
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
6234
+ step. If they are set here at the step level, these thresholds will override any thresholds
6235
+ set at the global level in `Validate(thresholds=...)`.
6236
+
6237
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
6238
+ validations operate on a single test unit (the aggregated value), threshold values are
6239
+ typically set as absolute counts:
6240
+
6241
+ - `thresholds=1` means any failure triggers a 'warning'
6242
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
6243
+
6244
+ Thresholds can be defined using one of these input schemes:
6245
+
6246
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
6247
+ thresholds)
6248
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
6249
+ the 'error' level, and position `2` is the 'critical' level
6250
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
6251
+ 'critical'
6252
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
6253
+ for the 'warning' level only
6254
+
6255
+ Examples
6256
+ --------
6257
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
6258
+ shown below:
6259
+
6260
+ ```python
6261
+ import pointblank as pb
6262
+ import polars as pl
6263
+
6264
+ tbl = pl.DataFrame(
6265
+ {
6266
+ "a": [1, 2, 3, 4, 5],
6267
+ "b": [2, 2, 2, 2, 2],
6268
+ }
6269
+ )
6270
+
6271
+ pb.preview(tbl)
6272
+ ```
6273
+
6274
+ Let's validate that the average of column `a` is at least `3`:
6275
+
6276
+ ```python
6277
+ validation = (
6278
+ pb.Validate(data=tbl)
6279
+ .col_avg_ge(columns="a", value=3)
6280
+ .interrogate()
6281
+ )
6282
+
6283
+ validation
6284
+ ```
6285
+
6286
+ The validation result shows whether the average comparison passed or failed. Since this
6287
+ is an aggregation-based validation, there is exactly one test unit per column.
6288
+
6289
+ When validating multiple columns, each column gets its own validation step:
6290
+
6291
+ ```python
6292
+ validation = (
6293
+ pb.Validate(data=tbl)
6294
+ .col_avg_ge(columns=["a", "b"], value=3)
6295
+ .interrogate()
6296
+ )
6297
+
6298
+ validation
6299
+ ```
6300
+
6301
+ Using tolerance for flexible comparisons:
6302
+
6303
+ ```python
6304
+ validation = (
6305
+ pb.Validate(data=tbl)
6306
+ .col_avg_ge(columns="a", value=3, tol=1.0)
6307
+ .interrogate()
6308
+ )
6309
+
6310
+ validation
6311
+ ```
6312
+
6313
+ col_avg_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
6314
+ Does the column average satisfy a less than or equal to comparison?
6315
+
6316
+ The `col_avg_le()` validation method checks whether the average of values in a column
6317
+ is at most a specified `value=`. This is an aggregation-based validation where the entire
6318
+ column is reduced to a single average value that is then compared against the target. The
6319
+ comparison used in this function is `average(column) <= value`.
6320
+
6321
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
6322
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
6323
+ the comparison) or fails completely.
6324
+
6325
+ Parameters
6326
+ ----------
6327
+ columns
6328
+ A single column or a list of columns to validate. If multiple columns are supplied,
6329
+ there will be a separate validation step generated for each column. The columns must
6330
+ contain numeric data for the average to be computed.
6331
+ value
6332
+ The value to compare the column average against. This can be: (1) a numeric literal
6333
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
6334
+ whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
6335
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
6336
+ `None` to automatically compare against the same column in reference data (shorthand for
6337
+ `ref(column_name)` when reference data is set).
6338
+ tol
6339
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
6340
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
6341
+ an average up to `0.5` above the target will still pass (see the Understanding
+ Tolerance section below).
6344
+ thresholds
6345
+ Failure threshold levels so that the validation step can react accordingly when
6346
+ failing test units exceed the set levels. Since this is an aggregation-based validation with only
6347
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
6348
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
6349
+ acceptable.
6350
+ brief
6351
+ An optional brief description of the validation step that will be displayed in the
6352
+ reporting table. You can use the templating elements like `"{step}"` to insert
6353
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
6354
+ the entire brief will be automatically generated. If `None` (the default) then there
6355
+ won't be a brief.
6356
+ actions
6357
+ Optional actions to take when the validation step meets or exceeds any set threshold
6358
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
6359
+ define the actions.
6360
+ active
6361
+ A boolean value indicating whether the validation step should be active. Using `False`
6362
+ will make the validation step inactive (still reporting its presence and keeping indexes
6363
+ for the steps unchanged).
6364
+
6365
+ Returns
6366
+ -------
6367
+ Validate
6368
+ The `Validate` object with the added validation step.
6369
+
6370
+ Using Reference Data
6371
+ --------------------
6372
+ The `col_avg_le()` method supports comparing column aggregations against reference data. This
6373
+ is useful for validating that statistical properties remain consistent across different
6374
+ versions of a dataset, or for comparing current data against historical baselines.
6375
+
6376
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
6377
+
6378
+ ```python
6379
+ validation = (
6380
+ pb.Validate(data=current_data, reference=baseline_data)
6381
+ .col_avg_le(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
6382
+ .interrogate()
6383
+ )
6384
+ ```
6385
+
6386
+ When `value=None` and reference data is set, the method automatically compares against the
6387
+ same column in the reference data. You can also explicitly specify reference columns using
6388
+ the `ref()` helper:
6389
+
6390
+ ```python
6391
+ .col_avg_le(columns="revenue", value=pb.ref("baseline_revenue"))
6392
+ ```
6393
+
6394
+ Understanding Tolerance
6395
+ -----------------------
6396
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
6397
+ floating-point aggregations where exact equality is often unreliable.
6398
+
6399
+ The `tol=` parameter expands the acceptable range for the comparison. For
+ `col_avg_le()`, a tolerance of `tol=0.5` raises the effective boundary, so an
+ average as high as `value + 0.5` will still pass validation.
6402
+
6403
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
6404
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
6405
+ shifts the comparison boundary.
6406
+
6407
+ Thresholds
6408
+ ----------
6409
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
6410
+ step. If they are set here at the step level, these thresholds will override any thresholds
6411
+ set at the global level in `Validate(thresholds=...)`.
6412
+
6413
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
6414
+ validations operate on a single test unit (the aggregated value), threshold values are
6415
+ typically set as absolute counts:
6416
+
6417
+ - `thresholds=1` means any failure triggers a 'warning'
6418
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
6419
+
6420
+ Thresholds can be defined using one of these input schemes:
6421
+
6422
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
6423
+ thresholds)
6424
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
6425
+ the 'error' level, and position `2` is the 'critical' level
6426
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
6427
+ 'critical'
6428
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
6429
+ for the 'warning' level only
6430
+
6431
+ Examples
6432
+ --------
6433
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
6434
+ shown below:
6435
+
6436
+ ```python
6437
+ import pointblank as pb
6438
+ import polars as pl
6439
+
6440
+ tbl = pl.DataFrame(
6441
+ {
6442
+ "a": [1, 2, 3, 4, 5],
6443
+ "b": [2, 2, 2, 2, 2],
6444
+ }
6445
+ )
6446
+
6447
+ pb.preview(tbl)
6448
+ ```
6449
+
6450
+ Let's validate that the average of column `a` is at most `3`:
6451
+
6452
+ ```python
6453
+ validation = (
6454
+ pb.Validate(data=tbl)
6455
+ .col_avg_le(columns="a", value=3)
6456
+ .interrogate()
6457
+ )
6458
+
6459
+ validation
6460
+ ```
6461
+
6462
+ The validation result shows whether the average comparison passed or failed. Since this
6463
+ is an aggregation-based validation, there is exactly one test unit per column.
6464
+
6465
+ When validating multiple columns, each column gets its own validation step:
6466
+
6467
+ ```python
6468
+ validation = (
6469
+ pb.Validate(data=tbl)
6470
+ .col_avg_le(columns=["a", "b"], value=3)
6471
+ .interrogate()
6472
+ )
6473
+
6474
+ validation
6475
+ ```
6476
+
6477
+ Using tolerance for flexible comparisons:
6478
+
6479
+ ```python
6480
+ validation = (
6481
+ pb.Validate(data=tbl)
6482
+ .col_avg_le(columns="a", value=3, tol=1.0)
6483
+ .interrogate()
6484
+ )
6485
+
6486
+ validation
6487
+ ```
6488
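+
+ Because each of these methods returns the `Validate` object, the `col_sum_*` and
+ `col_avg_*` checks compose into a single validation plan. For example, with the
+ `tbl` table used in the examples above:
+
+ ```python
+ validation = (
+ pb.Validate(data=tbl)
+ .col_sum_eq(columns="a", value=15)  # 1 + 2 + 3 + 4 + 5 == 15
+ .col_avg_ge(columns="a", value=3)   # avg(a) == 3
+ .col_avg_le(columns="a", value=3)   # avg(a) == 3
+ .interrogate()
+ )
+
+ validation
+ ```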
+
6489
+ col_avg_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
6490
+ Does the column average satisfy an equal to comparison?
6491
+
6492
+ The `col_avg_eq()` validation method checks whether the average of values in a column
6493
+ equals a specified `value=`. This is an aggregation-based validation where the entire
6494
+ column is reduced to a single average value that is then compared against the target. The
6495
+ comparison used in this function is `average(column) == value`.
6496
+
6497
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
6498
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
6499
+ the comparison) or fails completely.
6500
+
6501
+ Parameters
6502
+ ----------
6503
+ columns
6504
+ A single column or a list of columns to validate. If multiple columns are supplied,
6505
+ there will be a separate validation step generated for each column. The columns must
6506
+ contain numeric data for the average to be computed.
6507
+ value
6508
+ The value to compare the column average against. This can be: (1) a numeric literal
6509
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
6510
+ whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
6511
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
6512
+ `None` to automatically compare against the same column in reference data (shorthand for
6513
+ `ref(column_name)` when reference data is set).
6514
+ tol
6515
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
6516
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
6517
+ an average that differs from the target by up to `0.5` will still pass. This is
+ particularly useful with `col_avg_eq()`, where exact equality on floating-point
+ aggregations can fail due to numerical precision (see the Understanding Tolerance
+ section below).
6521
+ thresholds
6522
+ Failure threshold levels so that the validation step can react accordingly when
6523
+ failing test units are level. Since this is an aggregation-based validation with only
6524
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
6525
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
6526
+ acceptable.
6527
+ brief
6528
+ An optional brief description of the validation step that will be displayed in the
6529
+ reporting table. You can use the templating elements like `"{step}"` to insert
6530
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
6531
+ the entire brief will be automatically generated. If `None` (the default) then there
6532
+ won't be a brief.
6533
+ actions
6534
+ Optional actions to take when the validation step meets or exceeds any set threshold
6535
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
6536
+ define the actions.
6537
+ active
6538
+ A boolean value indicating whether the validation step should be active. Using `False`
6539
+ will make the validation step inactive (still reporting its presence and keeping indexes
6540
+ for the steps unchanged).
6541
+
6542
+ Returns
6543
+ -------
6544
+ Validate
6545
+ The `Validate` object with the added validation step.
6546
+
6547
+ Using Reference Data
6548
+ --------------------
6549
+ The `col_avg_eq()` method supports comparing column aggregations against reference data. This
6550
+ is useful for validating that statistical properties remain consistent across different
6551
+ versions of a dataset, or for comparing current data against historical baselines.
6552
+
6553
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
6554
+
6555
+ ```python
6556
+ validation = (
6557
+ pb.Validate(data=current_data, reference=baseline_data)
6558
+ .col_avg_eq(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
6559
+ .interrogate()
6560
+ )
6561
+ ```
6562
+
6563
+ When `value=None` and reference data is set, the method automatically compares against the
6564
+ same column in the reference data. You can also explicitly specify reference columns using
6565
+ the `ref()` helper:
6566
+
6567
+ ```python
6568
+ .col_avg_eq(columns="revenue", value=pb.ref("baseline_revenue"))
6569
+ ```
6570
+
6571
+ Understanding Tolerance
6572
+ -----------------------
6573
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
6574
+ floating-point aggregations where exact equality is often unreliable.
6575
+
6576
+ The `tol=` parameter is particularly useful with `col_avg_eq()` since exact equality
6577
+ comparisons on floating-point aggregations can be problematic due to numerical precision.
6578
+ Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
6579
+ floating-point arithmetic.
6580
+
6581
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
6582
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
6583
+ shifts the comparison boundary.
6584
+
6585
+ Thresholds
6586
+ ----------
6587
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
6588
+ step. If they are set here at the step level, these thresholds will override any thresholds
6589
+ set at the global level in `Validate(thresholds=...)`.
6590
+
6591
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
6592
+ validations operate on a single test unit (the aggregated value), threshold values are
6593
+ typically set as absolute counts:
6594
+
6595
+ - `thresholds=1` means any failure triggers a 'warning'
6596
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
6597
+
6598
+ Thresholds can be defined using one of these input schemes:
6599
+
6600
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
6601
+ thresholds)
6602
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
6603
+ the 'error' level, and position `2` is the 'critical' level
6604
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
6605
+ 'critical'
6606
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
6607
+ for the 'warning' level only
6608
+
6609
+ Examples
6610
+ --------
6611
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
6612
+ shown below:
6613
+
6614
+ ```python
6615
+ import pointblank as pb
6616
+ import polars as pl
6617
+
6618
+ tbl = pl.DataFrame(
6619
+ {
6620
+ "a": [1, 2, 3, 4, 5],
6621
+ "b": [2, 2, 2, 2, 2],
6622
+ }
6623
+ )
6624
+
6625
+ pb.preview(tbl)
6626
+ ```
6627
+
6628
+ Let's validate that the average of column `a` equals `3`:
6629
+
6630
+ ```python
6631
+ validation = (
6632
+ pb.Validate(data=tbl)
6633
+ .col_avg_eq(columns="a", value=3)
6634
+ .interrogate()
6635
+ )
6636
+
6637
+ validation
6638
+ ```
6639
+
6640
+ The validation result shows whether the average comparison passed or failed. Since this
6641
+ is an aggregation-based validation, there is exactly one test unit per column.
6642
+
6643
+ When validating multiple columns, each column gets its own validation step:
6644
+
6645
+ ```python
6646
+ validation = (
6647
+ pb.Validate(data=tbl)
6648
+ .col_avg_eq(columns=["a", "b"], value=3)
6649
+ .interrogate()
6650
+ )
6651
+
6652
+ validation
6653
+ ```
6654
+
6655
+ Using tolerance for flexible comparisons:
6656
+
6657
+ ```python
6658
+ validation = (
6659
+ pb.Validate(data=tbl)
6660
+ .col_avg_eq(columns="a", value=3, tol=1.0)
6661
+ .interrogate()
6662
+ )
6663
+
6664
+ validation
6665
+ ```
6666
+
6667
+ col_sd_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
6668
+ Does the column standard deviation satisfy a greater than comparison?
6669
+
6670
+ The `col_sd_gt()` validation method checks whether the standard deviation of values in a column
6671
+ is greater than a specified `value=`. This is an aggregation-based validation where the entire
6672
+ column is reduced to a single standard deviation value that is then compared against the target. The
6673
+ comparison used in this function is `sd(column) > value`.
6674
+
6675
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
6676
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
6677
+ the comparison) or fails completely.
6678
+
6679
+ Parameters
6680
+ ----------
6681
+ columns
6682
+ A single column or a list of columns to validate. If multiple columns are supplied,
6683
+ there will be a separate validation step generated for each column. The columns must
6684
+ contain numeric data for the standard deviation to be computed.
6685
+ value
6686
+ The value to compare the column standard deviation against. This can be: (1) a numeric literal
6687
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
6688
+ whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
6689
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
6690
+ `None` to automatically compare against the same column in reference data (shorthand for
6691
+ `ref(column_name)` when reference data is set).
6692
+ tol
6693
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
6694
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
6695
+ a standard deviation up to `0.5` below the target will still pass (see the
+ Understanding Tolerance section below).
6698
+ thresholds
6699
+ Failure threshold levels so that the validation step can react accordingly when
6700
+ failing test units exceed the set levels. Since this is an aggregation-based validation with only
6701
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
6702
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
6703
+ acceptable.
6704
+ brief
6705
+ An optional brief description of the validation step that will be displayed in the
6706
+ reporting table. You can use the templating elements like `"{step}"` to insert
6707
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
6708
+ the entire brief will be automatically generated. If `None` (the default) then there
6709
+ won't be a brief.
6710
+ actions
6711
+ Optional actions to take when the validation step meets or exceeds any set threshold
6712
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
6713
+ define the actions.
6714
+ active
6715
+ A boolean value indicating whether the validation step should be active. Using `False`
6716
+ will make the validation step inactive (still reporting its presence and keeping indexes
6717
+ for the steps unchanged).
6718
+
6719
+ Returns
6720
+ -------
6721
+ Validate
6722
+ The `Validate` object with the added validation step.
6723
+
6724
+ Using Reference Data
6725
+ --------------------
6726
+ The `col_sd_gt()` method supports comparing column aggregations against reference data. This
6727
+ is useful for validating that statistical properties remain consistent across different
6728
+ versions of a dataset, or for comparing current data against historical baselines.
6729
+
6730
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
6731
+
6732
+ ```python
6733
+ validation = (
6734
+ pb.Validate(data=current_data, reference=baseline_data)
6735
+ .col_sd_gt(columns="revenue")  # Compares sd(current.revenue) vs sd(baseline.revenue)
6736
+ .interrogate()
6737
+ )
6738
+ ```
6739
+
6740
+ When `value=None` and reference data is set, the method automatically compares against the
6741
+ same column in the reference data. You can also explicitly specify reference columns using
6742
+ the `ref()` helper:
6743
+
6744
+ ```python
6745
+ .col_sd_gt(columns="revenue", value=pb.ref("baseline_revenue"))
6746
+ ```
6747
+
6748
+ Understanding Tolerance
6749
+ -----------------------
6750
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
6751
+ floating-point aggregations where exact equality is often unreliable.
6752
+
6753
+ The `tol=` parameter expands the acceptable range for the comparison. For
+ `col_sd_gt()`, a tolerance of `tol=0.5` lowers the effective boundary, so a
+ standard deviation as low as `value - 0.5` will still pass validation.
6756
+
6757
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
6758
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
6759
+ shifts the comparison boundary.
6760
+
6761
+ Thresholds
6762
+ ----------
6763
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
6764
+ step. If they are set here at the step level, these thresholds will override any thresholds
6765
+ set at the global level in `Validate(thresholds=...)`.
6766
+
6767
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
6768
+ validations operate on a single test unit (the aggregated value), threshold values are
6769
+ typically set as absolute counts:
6770
+
6771
+ - `thresholds=1` means any failure triggers a 'warning'
6772
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
6773
+
6774
+ Thresholds can be defined using one of these input schemes:
6775
+
6776
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
6777
+ thresholds)
6778
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
6779
+ the 'error' level, and position `2` is the 'critical' level
6780
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
6781
+ 'critical'
6782
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
6783
+ for the 'warning' level only
6784
+
6785
+ Examples
6786
+ --------
6787
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
6788
+ shown below:
6789
+
6790
+ ```python
6791
+ import pointblank as pb
6792
+ import polars as pl
6793
+
6794
+ tbl = pl.DataFrame(
6795
+ {
6796
+ "a": [1, 2, 3, 4, 5],
6797
+ "b": [2, 2, 2, 2, 2],
6798
+ }
6799
+ )
6800
+
6801
+ pb.preview(tbl)
6802
+ ```
6803
+
6804
+ Let's validate that the standard deviation of column `a` is greater than `2`:
6805
+
6806
+ ```python
6807
+ validation = (
6808
+ pb.Validate(data=tbl)
6809
+ .col_sd_gt(columns="a", value=2)
6810
+ .interrogate()
6811
+ )
6812
+
6813
+ validation
6814
+ ```
6815
+
6816
+ The validation result shows whether the standard deviation comparison passed or failed. Since this
+ is an aggregation-based validation, there is exactly one test unit per column. (With these
+ data the standard deviation of column `a` is about `1.58`, which is below `2`, so this step
+ fails.)
6818
+
6819
+ When validating multiple columns, each column gets its own validation step:
6820
+
6821
+ ```python
6822
+ validation = (
6823
+ pb.Validate(data=tbl)
6824
+ .col_sd_gt(columns=["a", "b"], value=2)
6825
+ .interrogate()
6826
+ )
6827
+
6828
+ validation
6829
+ ```
6830
+
6831
+ Using tolerance for flexible comparisons:
6832
+
6833
+ ```python
6834
+ validation = (
6835
+ pb.Validate(data=tbl)
6836
+ .col_sd_gt(columns="a", value=2, tol=1.0)
6837
+ .interrogate()
6838
+ )
6839
+
6840
+ validation
6841
+ ```
6842
+
6843
+ col_sd_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
6844
+ Does the column standard deviation satisfy a less than comparison?
6845
+
6846
+ The `col_sd_lt()` validation method checks whether the standard deviation of values in a column
6847
+ is less than a specified `value=`. This is an aggregation-based validation where the entire
6848
+ column is reduced to a single standard deviation value that is then compared against the target. The
6849
+ comparison used in this function is `sd(column) < value`.
6850
+
6851
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
6852
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
6853
+ the comparison) or fails completely.
6854
+
6855
+ Parameters
6856
+ ----------
6857
+ columns
6858
+ A single column or a list of columns to validate. If multiple columns are supplied,
6859
+ there will be a separate validation step generated for each column. The columns must
6860
+ contain numeric data for the standard deviation to be computed.
6861
+ value
6862
+ The value to compare the column standard deviation against. This can be: (1) a numeric literal
6863
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
6864
+ whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
6865
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
6866
+ `None` to automatically compare against the same column in reference data (shorthand for
6867
+ `ref(column_name)` when reference data is set).
6868
+ tol
6869
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
6870
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
6871
+ a standard deviation that differs from the target by up to `0.5` will still pass. For
6872
+ `col_sd_lt()`, a tolerance of `tol=0.5` shifts the comparison boundary upward, so a
6873
+ standard deviation up to `0.5` above the target value still passes validation.
6874
+ thresholds
6875
+ Failure threshold levels so that the validation step can react accordingly when
6876
+ failing test units meet or exceed the set levels. Since this is an aggregation-based validation with only
6877
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
6878
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
6879
+ acceptable.
6880
+ brief
6881
+ An optional brief description of the validation step that will be displayed in the
6882
+ reporting table. You can use the templating elements like `"{step}"` to insert
6883
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
6884
+ the entire brief will be automatically generated. If `None` (the default) then there
6885
+ won't be a brief.
6886
+ actions
6887
+ Optional actions to take when the validation step meets or exceeds any set threshold
6888
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
6889
+ define the actions.
6890
+ active
6891
+ A boolean value indicating whether the validation step should be active. Using `False`
6892
+ will make the validation step inactive (still reporting its presence and keeping indexes
6893
+ for the steps unchanged).
6894
+
6895
+ Returns
6896
+ -------
6897
+ Validate
6898
+ The `Validate` object with the added validation step.
6899
+
6900
+ Using Reference Data
6901
+ --------------------
6902
+ The `col_sd_lt()` method supports comparing column aggregations against reference data. This
6903
+ is useful for validating that statistical properties remain consistent across different
6904
+ versions of a dataset, or for comparing current data against historical baselines.
6905
+
6906
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
6907
+
6908
+ ```python
6909
+ validation = (
6910
+ pb.Validate(data=current_data, reference=baseline_data)
6911
+ .col_sd_lt(columns="revenue")  # Compares sd(current.revenue) < sd(baseline.revenue)
6912
+ .interrogate()
6913
+ )
6914
+ ```
6915
+
6916
+ When `value=None` and reference data is set, the method automatically compares against the
6917
+ same column in the reference data. You can also explicitly specify reference columns using
6918
+ the `ref()` helper:
6919
+
6920
+ ```python
6921
+ .col_sd_lt(columns="revenue", value=pb.ref("baseline_revenue"))
6922
+ ```
6923
+
6924
+ Understanding Tolerance
6925
+ -----------------------
6926
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
6927
+ floating-point aggregations where exact equality is often unreliable.
6928
+
6929
+ The `tol=` parameter expands the acceptable range for the comparison. For
6930
+ `col_sd_lt()`, a tolerance of `tol=0.5` shifts the boundary upward, so a standard deviation
6931
+ up to `0.5` above the target value still passes validation.
6932
+
6933
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
6934
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
6935
+ shifts the comparison boundary.
6936
+
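+ As a concrete illustration of the boundary shift (a plain-Python sketch of the
+ arithmetic, not the package's internal implementation):
+
+ ```python
+ import statistics
+
+ sd = statistics.stdev([1, 2, 3, 4, 5])  # sample standard deviation, ~1.58
+
+ value, tol = 1.5, 0.5
+ print(sd < value)        # False: 1.58 is not less than 1.5
+ print(sd < value + tol)  # True: tol=0.5 shifts the boundary up to 2.0
+ ```
+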
6937
+ Thresholds
6938
+ ----------
6939
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
6940
+ step. If they are set here at the step level, these thresholds will override any thresholds
6941
+ set at the global level in `Validate(thresholds=...)`.
6942
+
6943
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
6944
+ validations operate on a single test unit (the aggregated value), threshold values are
6945
+ typically set as absolute counts:
6946
+
6947
+ - `thresholds=1` means any failure triggers a 'warning'
6948
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
6949
+
6950
+ Thresholds can be defined using one of these input schemes:
6951
+
6952
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
6953
+ thresholds)
6954
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
6955
+ the 'error' level, and position `2` is the 'critical' level
6956
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
6957
+ 'critical'
6958
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
6959
+ for the 'warning' level only
6960
+
6961
+ Examples
6962
+ --------
6963
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
6964
+ shown below:
6965
+
6966
+ ```python
6967
+ import pointblank as pb
6968
+ import polars as pl
6969
+
6970
+ tbl = pl.DataFrame(
6971
+ {
6972
+ "a": [1, 2, 3, 4, 5],
6973
+ "b": [2, 2, 2, 2, 2],
6974
+ }
6975
+ )
6976
+
6977
+ pb.preview(tbl)
6978
+ ```
6979
+
6980
+ Let's validate that the standard deviation of column `a` is less than `2`:
6981
+
6982
+ ```python
6983
+ validation = (
6984
+ pb.Validate(data=tbl)
6985
+ .col_sd_lt(columns="a", value=2)
6986
+ .interrogate()
6987
+ )
6988
+
6989
+ validation
6990
+ ```
6991
+
6992
+ The validation result shows whether the standard deviation comparison passed or failed. Since this
6993
+ is an aggregation-based validation, there is exactly one test unit per column.
6994
+
6995
+ When validating multiple columns, each column gets its own validation step:
6996
+
6997
+ ```python
6998
+ validation = (
6999
+ pb.Validate(data=tbl)
7000
+ .col_sd_lt(columns=["a", "b"], value=2)
7001
+ .interrogate()
7002
+ )
7003
+
7004
+ validation
7005
+ ```
7006
+
7007
+ Using tolerance for flexible comparisons:
7008
+
7009
+ ```python
7010
+ validation = (
7011
+ pb.Validate(data=tbl)
7012
+ .col_sd_lt(columns="a", value=2, tol=1.0)
7013
+ .interrogate()
7014
+ )
7015
+
7016
+ validation
7017
+ ```
7018
+
7019
+ col_sd_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
7020
+ Does the column standard deviation satisfy a greater than or equal to comparison?
7021
+
7022
+ The `col_sd_ge()` validation method checks whether the standard deviation of values in a column
7023
+ is at least a specified `value=`. This is an aggregation-based validation where the entire
7024
+ column is reduced to a single standard deviation value that is then compared against the target. The
7025
+ comparison used in this function is `standard deviation(column) >= value`.
7026
+
7027
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
7028
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
7029
+ the comparison) or fails completely.
7030
+
7031
+ Parameters
7032
+ ----------
7033
+ columns
7034
+ A single column or a list of columns to validate. If multiple columns are supplied,
7035
+ there will be a separate validation step generated for each column. The columns must
7036
+ contain numeric data for the standard deviation to be computed.
7037
+ value
7038
+ The value to compare the column standard deviation against. This can be: (1) a numeric literal
7039
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
7040
+ whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
7041
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
7042
+ `None` to automatically compare against the same column in reference data (shorthand for
7043
+ `ref(column_name)` when reference data is set).
7044
+ tol
7045
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
7046
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
7047
+ a standard deviation that differs from the target by up to `0.5` will still pass. For
7048
+ `col_sd_ge()`, a tolerance of `tol=0.5` shifts the comparison boundary downward, so a
7049
+ standard deviation up to `0.5` below the target value still passes validation.
7050
+ thresholds
7051
+ Failure threshold levels so that the validation step can react accordingly when
7052
+ failing test units meet or exceed the set levels. Since this is an aggregation-based validation with only
7053
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
7054
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
7055
+ acceptable.
7056
+ brief
7057
+ An optional brief description of the validation step that will be displayed in the
7058
+ reporting table. You can use the templating elements like `"{step}"` to insert
7059
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
7060
+ the entire brief will be automatically generated. If `None` (the default) then there
7061
+ won't be a brief.
7062
+ actions
7063
+ Optional actions to take when the validation step meets or exceeds any set threshold
7064
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
7065
+ define the actions.
7066
+ active
7067
+ A boolean value indicating whether the validation step should be active. Using `False`
7068
+ will make the validation step inactive (still reporting its presence and keeping indexes
7069
+ for the steps unchanged).
7070
+
7071
+ Returns
7072
+ -------
7073
+ Validate
7074
+ The `Validate` object with the added validation step.
7075
+
7076
+ Using Reference Data
7077
+ --------------------
7078
+ The `col_sd_ge()` method supports comparing column aggregations against reference data. This
7079
+ is useful for validating that statistical properties remain consistent across different
7080
+ versions of a dataset, or for comparing current data against historical baselines.
7081
+
7082
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
7083
+
7084
+ ```python
7085
+ validation = (
7086
+ pb.Validate(data=current_data, reference=baseline_data)
7087
+ .col_sd_ge(columns="revenue")  # Compares sd(current.revenue) >= sd(baseline.revenue)
7088
+ .interrogate()
7089
+ )
7090
+ ```
7091
+
7092
+ When `value=None` and reference data is set, the method automatically compares against the
7093
+ same column in the reference data. You can also explicitly specify reference columns using
7094
+ the `ref()` helper:
7095
+
7096
+ ```python
7097
+ .col_sd_ge(columns="revenue", value=pb.ref("baseline_revenue"))
7098
+ ```
7099
+
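+ As a self-contained sketch (the frames below are invented stand-ins for the
+ `current_data` and `baseline_data` placeholders above):
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ current_data = pl.DataFrame({"revenue": [100.0, 120.0, 95.0, 140.0]})
+ baseline_data = pl.DataFrame({"revenue": [100.0, 110.0, 105.0, 115.0]})
+
+ validation = (
+     pb.Validate(data=current_data, reference=baseline_data)
+     .col_sd_ge(columns="revenue")  # passes if sd(current) >= sd(baseline)
+     .interrogate()
+ )
+
+ validation
+ ```
+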
7100
+ Understanding Tolerance
7101
+ -----------------------
7102
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
7103
+ floating-point aggregations where exact equality is often unreliable.
7104
+
7105
+ The `tol=` parameter expands the acceptable range for the comparison. For
7106
+ `col_sd_ge()`, a tolerance of `tol=0.5` shifts the boundary downward, so a standard deviation
7107
+ up to `0.5` below the target value still passes validation.
7108
+
7109
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
7110
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
7111
+ shifts the comparison boundary.
7112
+
7113
+ Thresholds
7114
+ ----------
7115
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
7116
+ step. If they are set here at the step level, these thresholds will override any thresholds
7117
+ set at the global level in `Validate(thresholds=...)`.
7118
+
7119
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
7120
+ validations operate on a single test unit (the aggregated value), threshold values are
7121
+ typically set as absolute counts:
7122
+
7123
+ - `thresholds=1` means any failure triggers a 'warning'
7124
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
7125
+
7126
+ Thresholds can be defined using one of these input schemes:
7127
+
7128
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
7129
+ thresholds)
7130
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
7131
+ the 'error' level, and position `2` is the 'critical' level
7132
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
7133
+ 'critical'
7134
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
7135
+ for the 'warning' level only
7136
+
7137
+ Examples
7138
+ --------
7139
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
7140
+ shown below:
7141
+
7142
+ ```python
7143
+ import pointblank as pb
7144
+ import polars as pl
7145
+
7146
+ tbl = pl.DataFrame(
7147
+ {
7148
+ "a": [1, 2, 3, 4, 5],
7149
+ "b": [2, 2, 2, 2, 2],
7150
+ }
7151
+ )
7152
+
7153
+ pb.preview(tbl)
7154
+ ```
7155
+
7156
+ Let's validate that the standard deviation of column `a` is at least `2`:
7157
+
7158
+ ```python
7159
+ validation = (
7160
+ pb.Validate(data=tbl)
7161
+ .col_sd_ge(columns="a", value=2)
7162
+ .interrogate()
7163
+ )
7164
+
7165
+ validation
7166
+ ```
7167
+
7168
+ The validation result shows whether the standard deviation comparison passed or failed. Since this
7169
+ is an aggregation-based validation, there is exactly one test unit per column.
7170
+
7171
+ When validating multiple columns, each column gets its own validation step:
7172
+
7173
+ ```python
7174
+ validation = (
7175
+ pb.Validate(data=tbl)
7176
+ .col_sd_ge(columns=["a", "b"], value=2)
7177
+ .interrogate()
7178
+ )
7179
+
7180
+ validation
7181
+ ```
7182
+
7183
+ Using tolerance for flexible comparisons:
7184
+
7185
+ ```python
7186
+ validation = (
7187
+ pb.Validate(data=tbl)
7188
+ .col_sd_ge(columns="a", value=2, tol=1.0)
7189
+ .interrogate()
7190
+ )
7191
+
7192
+ validation
7193
+ ```
7194
+
7195
+ col_sd_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
7196
+ Does the column standard deviation satisfy a less than or equal to comparison?
7197
+
7198
+ The `col_sd_le()` validation method checks whether the standard deviation of values in a column
7199
+ is at most a specified `value=`. This is an aggregation-based validation where the entire
7200
+ column is reduced to a single standard deviation value that is then compared against the target. The
7201
+ comparison used in this function is `standard deviation(column) <= value`.
7202
+
7203
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
7204
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
7205
+ the comparison) or fails completely.
7206
+
7207
+ Parameters
7208
+ ----------
7209
+ columns
7210
+ A single column or a list of columns to validate. If multiple columns are supplied,
7211
+ there will be a separate validation step generated for each column. The columns must
7212
+ contain numeric data for the standard deviation to be computed.
7213
+ value
7214
+ The value to compare the column standard deviation against. This can be: (1) a numeric literal
7215
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
7216
+ whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
7217
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
7218
+ `None` to automatically compare against the same column in reference data (shorthand for
7219
+ `ref(column_name)` when reference data is set).
7220
+ tol
7221
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
7222
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
7223
+ a standard deviation that differs from the target by up to `0.5` will still pass. For
7224
+ `col_sd_le()`, a tolerance of `tol=0.5` shifts the comparison boundary upward, so a
7225
+ standard deviation up to `0.5` above the target value still passes validation.
7226
+ thresholds
7227
+ Failure threshold levels so that the validation step can react accordingly when
7228
+ failing test units meet or exceed the set levels. Since this is an aggregation-based validation with only
7229
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
7230
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
7231
+ acceptable.
7232
+ brief
7233
+ An optional brief description of the validation step that will be displayed in the
7234
+ reporting table. You can use the templating elements like `"{step}"` to insert
7235
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
7236
+ the entire brief will be automatically generated. If `None` (the default) then there
7237
+ won't be a brief.
7238
+ actions
7239
+ Optional actions to take when the validation step meets or exceeds any set threshold
7240
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
7241
+ define the actions.
7242
+ active
7243
+ A boolean value indicating whether the validation step should be active. Using `False`
7244
+ will make the validation step inactive (still reporting its presence and keeping indexes
7245
+ for the steps unchanged).
7246
+
7247
+ Returns
7248
+ -------
7249
+ Validate
7250
+ The `Validate` object with the added validation step.
7251
+
7252
+ Using Reference Data
7253
+ --------------------
7254
+ The `col_sd_le()` method supports comparing column aggregations against reference data. This
7255
+ is useful for validating that statistical properties remain consistent across different
7256
+ versions of a dataset, or for comparing current data against historical baselines.
7257
+
7258
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
7259
+
7260
+ ```python
7261
+ validation = (
7262
+ pb.Validate(data=current_data, reference=baseline_data)
7263
+ .col_sd_le(columns="revenue")  # Compares sd(current.revenue) <= sd(baseline.revenue)
7264
+ .interrogate()
7265
+ )
7266
+ ```
7267
+
7268
+ When `value=None` and reference data is set, the method automatically compares against the
7269
+ same column in the reference data. You can also explicitly specify reference columns using
7270
+ the `ref()` helper:
7271
+
7272
+ ```python
7273
+ .col_sd_le(columns="revenue", value=pb.ref("baseline_revenue"))
7274
+ ```
7275
+
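+ The `value=` argument also accepts a [`col()`](`pointblank.col`) reference, comparing
+ the standard deviation of one column against that of another column in the same table
+ (a sketch using the example table `tbl` defined in the Examples below):
+
+ ```python
+ validation = (
+     pb.Validate(data=tbl)
+     .col_sd_le(columns="b", value=pb.col("a"))  # passes if sd(b) <= sd(a)
+     .interrogate()
+ )
+ ```
+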
7276
+ Understanding Tolerance
7277
+ -----------------------
7278
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
7279
+ floating-point aggregations where exact equality is often unreliable.
7280
+
7281
+ The `tol=` parameter expands the acceptable range for the comparison. For
7282
+ `col_sd_le()`, a tolerance of `tol=0.5` shifts the boundary upward, so a standard deviation
7283
+ up to `0.5` above the target value still passes validation.
7284
+
7285
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
7286
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
7287
+ shifts the comparison boundary.
7288
+
7289
+ Thresholds
7290
+ ----------
7291
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
7292
+ step. If they are set here at the step level, these thresholds will override any thresholds
7293
+ set at the global level in `Validate(thresholds=...)`.
7294
+
7295
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
7296
+ validations operate on a single test unit (the aggregated value), threshold values are
7297
+ typically set as absolute counts:
7298
+
7299
+ - `thresholds=1` means any failure triggers a 'warning'
7300
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
7301
+
7302
+ Thresholds can be defined using one of these input schemes:
7303
+
7304
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
7305
+ thresholds)
7306
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
7307
+ the 'error' level, and position `2` is the 'critical' level
7308
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
7309
+ 'critical'
7310
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
7311
+ for the 'warning' level only
7312
+
7313
+ Examples
7314
+ --------
7315
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
7316
+ shown below:
7317
+
7318
+ ```python
7319
+ import pointblank as pb
7320
+ import polars as pl
7321
+
7322
+ tbl = pl.DataFrame(
7323
+ {
7324
+ "a": [1, 2, 3, 4, 5],
7325
+ "b": [2, 2, 2, 2, 2],
7326
+ }
7327
+ )
7328
+
7329
+ pb.preview(tbl)
7330
+ ```
7331
+
7332
+ Let's validate that the standard deviation of column `a` is at most `2`:
7333
+
7334
+ ```python
7335
+ validation = (
7336
+ pb.Validate(data=tbl)
7337
+ .col_sd_le(columns="a", value=2)
7338
+ .interrogate()
7339
+ )
7340
+
7341
+ validation
7342
+ ```
7343
+
7344
+ The validation result shows whether the standard deviation comparison passed or failed. Since this
7345
+ is an aggregation-based validation, there is exactly one test unit per column.
7346
+
7347
+ When validating multiple columns, each column gets its own validation step:
7348
+
7349
+ ```python
7350
+ validation = (
7351
+ pb.Validate(data=tbl)
7352
+ .col_sd_le(columns=["a", "b"], value=2)
7353
+ .interrogate()
7354
+ )
7355
+
7356
+ validation
7357
+ ```
7358
+
7359
+ Using tolerance for flexible comparisons:
7360
+
7361
+ ```python
7362
+ validation = (
7363
+ pb.Validate(data=tbl)
7364
+ .col_sd_le(columns="a", value=2, tol=1.0)
7365
+ .interrogate()
7366
+ )
7367
+
7368
+ validation
7369
+ ```
7370
+
7371
+ col_sd_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
7372
+ Does the column standard deviation satisfy an equal to comparison?
7373
+
7374
+ The `col_sd_eq()` validation method checks whether the standard deviation of values in a column
7375
+ equals a specified `value=`. This is an aggregation-based validation where the entire
7376
+ column is reduced to a single standard deviation value that is then compared against the target. The
7377
+ comparison used in this function is `standard deviation(column) == value`.
7378
+
7379
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
7380
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
7381
+ the comparison) or fails completely.
7382
+
7383
+ Parameters
7384
+ ----------
7385
+ columns
7386
+ A single column or a list of columns to validate. If multiple columns are supplied,
7387
+ there will be a separate validation step generated for each column. The columns must
7388
+ contain numeric data for the standard deviation to be computed.
7389
+ value
7390
+ The value to compare the column standard deviation against. This can be: (1) a numeric literal
7391
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
7392
+ whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
7393
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
7394
+ `None` to automatically compare against the same column in reference data (shorthand for
7395
+ `ref(column_name)` when reference data is set).
7396
+ tol
7397
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
7398
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
7399
+ a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_sd_eq()` since exact equality
7400
+ comparisons on floating-point aggregations can be problematic due to numerical precision.
7401
+ Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
7402
+ floating-point arithmetic.
7403
+ thresholds
7404
+ Failure threshold levels so that the validation step can react accordingly when
7405
+ failing test units meet or exceed the set levels. Since this is an aggregation-based validation with only
7406
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
7407
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
7408
+ acceptable.
7409
+ brief
7410
+ An optional brief description of the validation step that will be displayed in the
7411
+ reporting table. You can use the templating elements like `"{step}"` to insert
7412
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
7413
+ the entire brief will be automatically generated. If `None` (the default) then there
7414
+ won't be a brief.
7415
+ actions
7416
+ Optional actions to take when the validation step meets or exceeds any set threshold
7417
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
7418
+ define the actions (see the sketch after this parameter list).
7419
+ active
7420
+ A boolean value indicating whether the validation step should be active. Using `False`
7421
+ will make the validation step inactive (still reporting its presence and keeping indexes
7422
+ for the steps unchanged).
7423
+
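+ As a hedged sketch of wiring a step-level action (this assumes the message-string form
+ of `Actions` with `"{step}"` templating and the example table `tbl` from the Examples
+ section below; adjust to the `Actions` docs as needed):
+
+ ```python
+ validation = (
+     pb.Validate(data=tbl)
+     .col_sd_eq(
+         columns="a",
+         value=2,
+         thresholds=1,  # any failure triggers the 'warning' level
+         actions=pb.Actions(warning="Step {step}: standard deviation check failed."),
+     )
+     .interrogate()
+ )
+ ```
+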
7424
+ Returns
7425
+ -------
7426
+ Validate
7427
+ The `Validate` object with the added validation step.
7428
+
7429
+ Using Reference Data
7430
+ --------------------
7431
+ The `col_sd_eq()` method supports comparing column aggregations against reference data. This
7432
+ is useful for validating that statistical properties remain consistent across different
7433
+ versions of a dataset, or for comparing current data against historical baselines.
7434
+
7435
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
7436
+
7437
+ ```python
7438
+ validation = (
7439
+ pb.Validate(data=current_data, reference=baseline_data)
7440
+ .col_sd_eq(columns="revenue")  # Compares sd(current.revenue) == sd(baseline.revenue)
7441
+ .interrogate()
7442
+ )
7443
+ ```
7444
+
7445
+ When `value=None` and reference data is set, the method automatically compares against the
7446
+ same column in the reference data. You can also explicitly specify reference columns using
7447
+ the `ref()` helper:
7448
+
7449
+ ```python
7450
+ .col_sd_eq(columns="revenue", value=pb.ref("baseline_revenue"))
7451
+ ```
7452
+
7453
+ Understanding Tolerance
7454
+ -----------------------
7455
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
7456
+ floating-point aggregations where exact equality is often unreliable.
7457
+
7458
+ The `tol=` parameter is particularly useful with `col_sd_eq()` since exact equality
7459
+ comparisons on floating-point aggregations can be problematic due to numerical precision.
7460
+ Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
7461
+ floating-point arithmetic.
7462
+
7463
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
7464
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
7465
+ shifts the comparison boundary.
7466
+
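+ As a concrete illustration of the equality range (a plain-Python sketch of the
+ arithmetic, not the package's internal implementation):
+
+ ```python
+ import statistics
+
+ sd = statistics.stdev([2, 2, 3, 4, 4])  # exactly 1.0 for this sample
+ value, tol = 1.0, 0.05
+
+ # Equality with tolerance: valid when sd falls within [value - tol, value + tol]
+ print(value - tol <= sd <= value + tol)  # True
+ ```
+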
7467
+ Thresholds
7468
+ ----------
7469
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
7470
+ step. If they are set here at the step level, these thresholds will override any thresholds
7471
+ set at the global level in `Validate(thresholds=...)`.
7472
+
7473
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
7474
+ validations operate on a single test unit (the aggregated value), threshold values are
7475
+ typically set as absolute counts:
7476
+
7477
+ - `thresholds=1` means any failure triggers a 'warning'
7478
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
7479
+
7480
+ Thresholds can be defined using one of these input schemes:
7481
+
7482
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
7483
+ thresholds)
7484
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
7485
+ the 'error' level, and position `2` is the 'critical' level
7486
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
7487
+ 'critical'
7488
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
7489
+ for the 'warning' level only
7490
+
7491
+ Examples
7492
+ --------
7493
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
7494
+ shown below:
7495
+
7496
+ ```python
7497
+ import pointblank as pb
7498
+ import polars as pl
7499
+
7500
+ tbl = pl.DataFrame(
7501
+ {
7502
+ "a": [1, 2, 3, 4, 5],
7503
+ "b": [2, 2, 2, 2, 2],
7504
+ }
7505
+ )
7506
+
7507
+ pb.preview(tbl)
7508
+ ```
7509
+
7510
+ Let's validate that the standard deviation of column `a` equals `2`:
7511
+
7512
+ ```python
7513
+ validation = (
7514
+ pb.Validate(data=tbl)
7515
+ .col_sd_eq(columns="a", value=2)
7516
+ .interrogate()
7517
+ )
7518
+
7519
+ validation
7520
+ ```
7521
+
7522
+ The validation result shows whether the standard deviation comparison passed or failed. Since this
7523
+ is an aggregation-based validation, there is exactly one test unit per column.
7524
+
7525
+ When validating multiple columns, each column gets its own validation step:
7526
+
7527
+ ```python
7528
+ validation = (
7529
+ pb.Validate(data=tbl)
7530
+ .col_sd_eq(columns=["a", "b"], value=2)
7531
+ .interrogate()
7532
+ )
7533
+
7534
+ validation
7535
+ ```
7536
+
7537
+ Using tolerance for flexible comparisons:
7538
+
7539
+ ```python
7540
+ validation = (
7541
+ pb.Validate(data=tbl)
7542
+ .col_sd_eq(columns="a", value=2, tol=1.0)
7543
+ .interrogate()
7544
+ )
7545
+
7546
+ validation
7547
+ ```
7548
+
7549
+ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4904
7550
 
4905
7551
  Validate whether rows in the table are distinct.
4906
7552
 
@@ -5090,7 +7736,7 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
5090
7736
  others.
5091
7737
 
5092
7738
 
5093
- rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
7739
+ rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5094
7740
 
5095
7741
  Validate whether row data are complete by having no missing values.
5096
7742
 
@@ -5280,7 +7926,7 @@ rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
5280
7926
  others.
5281
7927
 
5282
7928
 
5283
- col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
7929
+ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5284
7930
 
5285
7931
  Validate whether one or more columns exist in the table.
5286
7932
 
@@ -5632,7 +8278,7 @@ col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnS
5632
8278
  ```python
5633
8279
  validation = (
5634
8280
  pb.Validate(data=tbl)
5635
- .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3) # Expect 3 Nulls, allow -10%/+30%
8281
+ .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3)) # Expect 3 Nulls, allow -10%/+30%
5636
8282
  .interrogate()
5637
8283
  )
5638
8284
 
@@ -5643,7 +8289,7 @@ col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnS
5643
8289
  calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
5644
8290
 
5645
8291
 
5646
- col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
8292
+ col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5647
8293
 
5648
8294
  Do columns in the table (and their types) match a predefined schema?
5649
8295
 
@@ -5803,7 +8449,7 @@ col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'boo
5803
8449
  since the table columns and their types match the schema.
5804
8450
 
5805
8451
 
5806
- row_count_match(self, count: 'int | FrameT | Any', tol: 'Tolerance' = 0, inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
8452
+ row_count_match(self, count: 'int | Any', tol: 'Tolerance' = 0, inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5807
8453
 
5808
8454
  Validate whether the row count of the table matches a specified count.
5809
8455
 
@@ -5957,7 +8603,7 @@ row_count_match(self, count: 'int | FrameT | Any', tol: 'Tolerance' = 0, inverse
5957
8603
 
5958
8604
 
5959
8605
 
5960
- col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
8606
+ col_count_match(self, count: 'int | Any', inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5961
8607
 
5962
8608
  Validate whether the column count of the table matches a specified count.
5963
8609
 
@@ -6072,7 +8718,7 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
6072
8718
  columns in the target table. So, the single test unit passed.
6073
8719
 
6074
8720
 
6075
- tbl_match(self, tbl_compare: 'FrameT | Any', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
8721
+ tbl_match(self, tbl_compare: 'Any', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
6076
8722
 
6077
8723
  Validate whether the target table matches a comparison table.
6078
8724
 
@@ -6295,7 +8941,7 @@ tbl_match(self, tbl_compare: 'FrameT | Any', pre: 'Callable | None' = None, thre
6295
8941
  (one value is different in column `c`).
6296
8942
 
6297
8943
 
6298
- conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
8944
+ conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
6299
8945
 
6300
8946
  Perform multiple row-wise validations for joint validity.
6301
8947
 
@@ -6494,7 +9140,7 @@ conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds:
6494
9140
  information on how to use it with different table backends.
6495
9141
 
6496
9142
 
6497
- specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
9143
+ specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
6498
9144
 
6499
9145
  Perform a specialized validation with customized logic.
6500
9146
 
@@ -6794,7 +9440,7 @@ specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'in
6794
9440
  virtually any data quality requirement in your organization.
6795
9441
 
6796
9442
 
6797
- prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | None' = None, batch_size: 'int' = 1000, max_concurrent: 'int' = 3, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
9443
+ prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | None' = None, batch_size: 'int' = 1000, max_concurrent: 'int' = 3, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
6798
9444
 
6799
9445
  Validate rows using AI/LLM-powered analysis.
6800
9446
 
@@ -7115,7 +9761,7 @@ many steps). Furthermore, the `col()` function can be used to declare a comparis
7115
9761
  for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
7116
9762
  for comparison.
7117
9763
 
7118
- col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals') -> 'Column | ColumnLiteral | ColumnSelectorNarwhals'
9764
+ col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals | nw.selectors.Selector') -> 'Column | ColumnLiteral | ColumnSelectorNarwhals'
7119
9765
 
7120
9766
  Helper function for referencing a column in the input table.
7121
9767
 
@@ -8735,7 +11381,7 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =
8735
11381
  `get_first_n=10`.
8736
11382
 
8737
11383
 
8738
- set_tbl(self, tbl: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None' = None) -> 'Validate'
11384
+ set_tbl(self, tbl: 'Any', tbl_name: 'str | None' = None, label: 'str | None' = None) -> 'Validate'
8739
11385
 
8740
11386
  Set or replace the table associated with the Validate object.
8741
11387
 
@@ -8837,7 +11483,7 @@ set_tbl(self, tbl: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str |
8837
11483
  ```
8838
11484
 
8839
11485
 
8840
- get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None, incl_footer_timings: 'bool' = None, incl_footer_notes: 'bool' = None) -> 'GT'
11486
+ get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool | None' = None, incl_footer: 'bool | None' = None, incl_footer_timings: 'bool | None' = None, incl_footer_notes: 'bool | None' = None) -> 'GT'
8841
11487
 
8842
11488
  Validation report as a GT table.
8843
11489
 
@@ -9210,7 +11856,7 @@ get_json_report(self, use_fields: 'list[str] | None' = None, exclude_fields: 'li
9210
11856
  failed validation
9211
11857
 
9212
11858
 
9213
- get_sundered_data(self, type='pass') -> 'FrameT'
11859
+ get_sundered_data(self, type='pass') -> 'Any'
9214
11860
 
9215
11861
  Get the data that passed or failed the validation steps.
9216
11862
 
@@ -9246,7 +11892,7 @@ get_sundered_data(self, type='pass') -> 'FrameT'
9246
11892
 
9247
11893
  Returns
9248
11894
  -------
9249
- FrameT
11895
+ Any
9250
11896
  A table containing the data that passed or failed the validation steps.
9251
11897
 
9252
11898
  Examples
@@ -9291,7 +11937,7 @@ get_sundered_data(self, type='pass') -> 'FrameT'
9291
11937
  that's what we see in the returned DataFrame.
9292
11938
 
9293
11939
 
9294
- get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = False) -> 'dict[int, FrameT | None] | FrameT | None'
11940
+ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = False) -> 'dict[int, Any] | Any'
9295
11941
 
9296
11942
  Get the rows that failed for each validation step.
9297
11943
 
@@ -9314,7 +11960,7 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
9314
11960
 
9315
11961
  Returns
9316
11962
  -------
9317
- dict[int, FrameT | None] | FrameT | None
11963
+ dict[int, Any] | Any
9318
11964
  A dictionary of tables containing the rows that failed in every compatible validation
9319
11965
  step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
9320
11966
 
@@ -10471,7 +13117,7 @@ datasets included in the package can be accessed via the `load_dataset()` functi
10471
13117
  `config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
10472
13118
  the `assistant()` function to get help with Pointblank.
10473
13119
 
10474
- DataScan(data: 'IntoFrameT', tbl_name: 'str | None' = None) -> 'None'
13120
+ DataScan(data: 'Any', tbl_name: 'str | None' = None) -> 'None'
10475
13121
 
10476
13122
  Get a summary of a dataset.
10477
13123
 
@@ -10567,7 +13213,7 @@ DataScan(data: 'IntoFrameT', tbl_name: 'str | None' = None) -> 'None'
10567
13213
  A DataScan object.
10568
13214
 
10569
13215
 
10570
- preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None' = None, n_head: 'int' = 5, n_tail: 'int' = 5, limit: 'int' = 50, show_row_numbers: 'bool' = True, max_col_width: 'int' = 250, min_tbl_width: 'int' = 500, incl_header: 'bool' = None) -> 'GT'
13216
+ preview(data: 'Any', columns_subset: 'str | list[str] | Column | None' = None, n_head: 'int' = 5, n_tail: 'int' = 5, limit: 'int' = 50, show_row_numbers: 'bool' = True, max_col_width: 'int' = 250, min_tbl_width: 'int' = 500, incl_header: 'bool | None' = None) -> 'GT'
10571
13217
 
10572
13218
  Display a table preview that shows some rows from the top, some from the bottom.
10573
13219
 
@@ -10766,7 +13412,7 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
10766
13412
  function.
10767
13413
 
10768
13414
 
10769
- col_summary_tbl(data: 'FrameT | Any', tbl_name: 'str | None' = None) -> 'GT'
13415
+ col_summary_tbl(data: 'Any', tbl_name: 'str | None' = None) -> 'GT'
10770
13416
 
10771
13417
  Generate a column-level summary table of a dataset.
10772
13418
 
@@ -10843,7 +13489,7 @@ col_summary_tbl(data: 'FrameT | Any', tbl_name: 'str | None' = None) -> 'GT'
10843
13489
  ```
10844
13490
 
10845
13491
 
10846
- missing_vals_tbl(data: 'FrameT | Any') -> 'GT'
13492
+ missing_vals_tbl(data: 'Any') -> 'GT'
10847
13493
 
10848
13494
  Display a table that shows the missing values in the input table.
10849
13495
 
@@ -10917,7 +13563,7 @@ missing_vals_tbl(data: 'FrameT | Any') -> 'GT'
10917
13563
  sector. Many columns have no missing values at all, and those sectors are colored light blue.
10918
13564
 
10919
13565
 
10920
- assistant(model: 'str', data: 'FrameT | Any | None' = None, tbl_name: 'str | None' = None, api_key: 'str | None' = None, display: 'str | None' = None) -> 'None'
13566
+ assistant(model: 'str', data: 'Any' = None, tbl_name: 'str | None' = None, api_key: 'str | None' = None, display: 'str | None' = None) -> 'None'
10921
13567
 
10922
13568
  Chat with the PbA (Pointblank Assistant) about your data validation needs.
10923
13569
 
@@ -11061,7 +13707,7 @@ assistant(model: 'str', data: 'FrameT | Any | None' = None, tbl_name: 'str | Non
11061
13707
  library. The loading preference is Polars first, then Pandas as a fallback.
11062
13708
 
11063
13709
 
11064
- load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', tbl_type: "Literal['polars', 'pandas', 'duckdb']" = 'polars') -> 'FrameT | Any'
13710
+ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', tbl_type: "Literal['polars', 'pandas', 'duckdb']" = 'polars') -> 'Any'
11065
13711
 
11066
13712
  Load a dataset hosted in the library as specified table type.
11067
13713
 
@@ -11082,7 +13728,7 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'glo
11082
13728
 
11083
13729
  Returns
11084
13730
  -------
11085
- FrameT | Any
13731
+ Any
11086
13732
  The dataset for the `Validate` object. This could be a Polars DataFrame, a Pandas DataFrame,
11087
13733
  or a DuckDB table as an Ibis table.
11088
13734
 
@@ -11374,7 +14020,7 @@ from YAML strings or files. The `validate_yaml()` function checks if the YAML co
11374
14020
  its own validity checks. The `yaml_to_python()` function converts YAML configuration to equivalent
11375
14021
  Python code.
11376
14022
 
11377
- yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None, namespaces: 'Optional[Union[Iterable[str], Mapping[str, str]]]' = None) -> 'Validate'
14023
+ yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Any' = None, namespaces: 'Optional[Union[Iterable[str], Mapping[str, str]]]' = None) -> 'Validate'
11378
14024
  Execute a YAML-based validation workflow.
11379
14025
 
11380
14026
  This is the main entry point for YAML-based validation workflows. It takes YAML configuration
@@ -11863,7 +14509,7 @@ columns or rows in a table. The `get_action_metadata()` function is useful when
11863
14509
  actions since it returns metadata about the validation step that's triggering the action. Lastly,
11864
14510
  the `config()` utility lets us set global configuration parameters.
11865
14511
 
11866
- get_column_count(data: 'FrameT | Any') -> 'int'
14512
+ get_column_count(data: 'Any') -> 'int'
11867
14513
 
11868
14514
  Get the number of columns in a table.
11869
14515
 
@@ -11978,7 +14624,7 @@ get_column_count(data: 'FrameT | Any') -> 'int'
11978
14624
  `8` for the `small_table` dataset.
11979
14625
 
11980
14626
 
11981
- get_row_count(data: 'FrameT | Any') -> 'int'
14627
+ get_row_count(data: 'Any') -> 'int'
11982
14628
 
11983
14629
  Get the number of rows in a table.
11984
14630
 
@@ -12602,7 +15248,7 @@ send a Slack notification when validation steps exceed failure threshold levels
12602
15248
  summary of the validation results, including the status, number of steps, passing and failing steps,
12603
15249
  table information, and timing details.
12604
15250
 
12605
- send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None' = None, summary_msg: 'str | None' = None, debug: 'bool' = False) -> 'Callable'
15251
+ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None' = None, summary_msg: 'str | None' = None, debug: 'bool' = False) -> 'Callable | None'
12606
15252
 
12607
15253
  Create a Slack notification function using a webhook URL.
12608
15254